diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json
deleted file mode 100644
index fa14ca12f337280b15a3bbfa258ffa590c0dbfe3..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Answer Given options",
-  "evaluation": {
-    "accuracy": 0.7835870284579749
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json
deleted file mode 100644
index 622b0178d3dc61824ffc4b9d954e29bbd0185009..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Choose Story Ending",
-  "evaluation": {
-    "accuracy": 0.9291859695565851
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json
deleted file mode 100644
index f3e3d5afe9f834ed904e4b4e7eda2b9f6baf767e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Generate Ending",
-  "evaluation": {
-    "accuracy": 0.6624751819986764
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json
deleted file mode 100644
index 4210ba765770eac93980a029b2bc352c8972c508..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Novel Correct Ending",
-  "evaluation": {
-    "accuracy": 0.9252150893448048
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json
deleted file mode 100644
index f560c7f8871d2369ce47abd12e53e7667e83db89..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Story Continuation and Options",
-  "evaluation": {
-    "accuracy": 0.9159497021839841
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json
deleted file mode 100644
index 4a8c7a2f686d8f7e86828e16fbfd066edfdf6cd0..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Answer Given options",
-  "evaluation": {
-    "accuracy": 0.870946393117141
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json
deleted file mode 100644
index cdfab5115b88fdcc96feeb85dd4f93ee938b65d7..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Choose Story Ending",
-  "evaluation": {
-    "accuracy": 0.9523494374586366
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json
deleted file mode 100644
index 49cbe5f2b61f12b9305cf9fb741f260ad3487a6f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Generate Ending",
-  "evaluation": {
-    "accuracy": 0.7319655857048313
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json
deleted file mode 100644
index 84abc297253d8c49058803f80b2439bbec58dc0c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Novel Correct Ending",
-  "evaluation": {
-    "accuracy": 0.9477167438782264
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json
deleted file mode 100644
index 133194939f44992d70f7b27f95d3d923898d3a47..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Story Continuation and Options",
-  "evaluation": {
-    "accuracy": 0.9516876240900066
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json
deleted file mode 100644
index 9f90e311862c4d4f544147953fedfe8b2e09bf27..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Answer Given options",
-  "evaluation": {
-    "accuracy": 0.6982131039046989
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json
deleted file mode 100644
index ac3a09567cd16940b268699baf757d683c11d3e2..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Choose Story Ending",
-  "evaluation": {
-    "accuracy": 0.85704831237591
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json
deleted file mode 100644
index 537bd7aca6cf5b4d60179d3e77429be8d2472ffa..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Generate Ending",
-  "evaluation": {
-    "accuracy": 0.614162806088683
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json
deleted file mode 100644
index 7b613d66b0fa70d7ede65259912c94c972606b72..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Novel Correct Ending",
-  "evaluation": {
-    "accuracy": 0.8590337524818001
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json
deleted file mode 100644
index b78868605b4fc2d566a9cf12cfbd26fecb04615f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Story Continuation and Options",
-  "evaluation": {
-    "accuracy": 0.8504301786896096
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json
deleted file mode 100644
index dfcddff9122832e4023de739a08b165405f4bf97..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Answer Given options",
-  "evaluation": {
-    "accuracy": 0.7683653209794837
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json
deleted file mode 100644
index 0249f476c61770d371aa612ab6f723bec18d3c63..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Choose Story Ending",
-  "evaluation": {
-    "accuracy": 0.8742554599602912
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json
deleted file mode 100644
index 15715bbec8f0ca5626c8896695ab1ec869daef82..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Generate Ending",
-  "evaluation": {
-    "accuracy": 0.657180675049636
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json
deleted file mode 100644
index 2825f4a6c0a15ea38107bcf743d17899fc10205c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Novel Correct Ending",
-  "evaluation": {
-    "accuracy": 0.886829913964262
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json
deleted file mode 100644
index 9c3a3be4b72ddd989d75644db452c9a43117588c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Story Continuation and Options",
-  "evaluation": {
-    "accuracy": 0.8762409000661814
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json
deleted file mode 100644
index b5f961fd1a7f2e5d4fcc0cf3b9b24c3d5a3636fb..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Answer Given options",
-  "evaluation": {
-    "accuracy": 0.8332230311052283
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json
deleted file mode 100644
index 08e1da3e2e4a3c184f8a9616f90c0c499646325e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Choose Story Ending",
-  "evaluation": {
-    "accuracy": 0.913964262078094
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json
deleted file mode 100644
index 1c19d4576266454a6162862c53b416bebe0d6b03..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Generate Ending",
-  "evaluation": {
-    "accuracy": 0.700198544010589
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json
deleted file mode 100644
index 61259b0ab613799ec9d0e717e6b66e3e5676e6de..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Novel Correct Ending",
-  "evaluation": {
-    "accuracy": 0.9205823957643945
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json
deleted file mode 100644
index 0599dff562b06459a0b386388649b3fdb2d795d3..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Story Continuation and Options",
-  "evaluation": {
-    "accuracy": 0.9086697551290536
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json
deleted file mode 100644
index d4fea73deeac81c8a3056ea7163c644193143f38..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Answer Given options",
-  "evaluation": {
-    "accuracy": 0.870946393117141
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json
deleted file mode 100644
index 50a8a9f7bcc5d647795ef2452673fb071e542ef9..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Choose Story Ending",
-  "evaluation": {
-    "accuracy": 0.9265387160820648
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json
deleted file mode 100644
index 927c5f705c7b4716a9e99d27d3dde0bd724ba42c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Generate Ending",
-  "evaluation": {
-    "accuracy": 0.6823295830575777
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json
deleted file mode 100644
index b1a9390ed70408e37d3597045e7b9ec029eee5b4..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Novel Correct Ending",
-  "evaluation": {
-    "accuracy": 0.928524156187955
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json
deleted file mode 100644
index c912504a65bd932e2813c6dfbec4dfa31c750e9f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Story Continuation and Options",
-  "evaluation": {
-    "accuracy": 0.9232296492389146
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json
deleted file mode 100644
index e8a5a22b9790053662904d0e281b3325e8779d96..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "en",
-  "template_name": "Replace",
-  "evaluation": {
-    "accuracy": 0.6933333333333334
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json
deleted file mode 100644
index fb72eae6a032d146ba4bda652b3c2831cb267690..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "en",
-  "template_name": "True or False",
-  "evaluation": {
-    "accuracy": 0.5212903225806451
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json
deleted file mode 100644
index 66aa24f4760f073f17cd55f9e6a510c174ad2875..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "en",
-  "template_name": "does underscore refer to",
-  "evaluation": {
-    "accuracy": 0.6563440860215054
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json
deleted file mode 100644
index 96dd97d1f840cfe0c4253ab2c2c375afba165614..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "en",
-  "template_name": "stand for",
-  "evaluation": {
-    "accuracy": 0.5156989247311828
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json
deleted file mode 100644
index 6ea1d115d892e712d5a15507f7c53b35973f58e6..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "en",
-  "template_name": "underscore refer to",
-  "evaluation": {
-    "accuracy": 0.6473118279569893
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json
deleted file mode 100644
index 0e44039abfed86c270ecfd42407b4a632f5beb6a..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "Replace",
-  "evaluation": {
-    "accuracy": 0.6024096385542169
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json
deleted file mode 100644
index 676ff78442f70e7604c9d4e4a162104fba700bc5..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "True or False",
-  "evaluation": {
-    "accuracy": 0.46987951807228917
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json
deleted file mode 100644
index 57fa5bc71b075b999289da2b81ab4cbd13217d92..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "does underscore refer to",
-  "evaluation": {
-    "accuracy": 0.5903614457831325
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json
deleted file mode 100644
index aca34b8ede42cbef1f40485a79e370482f93eb7f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "stand for",
-  "evaluation": {
-    "accuracy": 0.4939759036144578
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json
deleted file mode 100644
index 9e52bc4b92c7834bfbc21e02c80d8ec944c6eb26..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "underscore refer to",
-  "evaluation": {
-    "accuracy": 0.6867469879518072
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json
deleted file mode 100644
index 3a6b4ece370951cdb9b44db9ac165944227af945..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "Replace",
-  "evaluation": {
-    "accuracy": 0.6463878326996197
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json
deleted file mode 100644
index 14851a8471b73c07a28c1b00f9b88a4aa6722984..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "True or False",
-  "evaluation": {
-    "accuracy": 0.5285171102661597
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json
deleted file mode 100644
index 091ffea556f32c5e4e5869762a4c921446a22f69..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "does underscore refer to",
-  "evaluation": {
-    "accuracy": 0.6007604562737643
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json
deleted file mode 100644
index 310ef9e24ba6109c7c26de82e901349762c9471a..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "stand for",
-  "evaluation": {
-    "accuracy": 0.49809885931558934
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json
deleted file mode 100644
index ef31c9a05fdae650875c5ab14ba65ad812396606..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "underscore refer to",
-  "evaluation": {
-    "accuracy": 0.6083650190114068
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json
deleted file mode 100644
index 1fbc6d83b73966e85bc5e1da6f2cd012adba195a..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "Replace",
-  "evaluation": {
-    "accuracy": 0.7063492063492064
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json
deleted file mode 100644
index 51ed558709378889fed0678b936ed71050d4f2db..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "True or False",
-  "evaluation": {
-    "accuracy": 0.5515873015873016
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json
deleted file mode 100644
index f0f243e26825b9625fdde9c5dd2af458fce410ac..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "does underscore refer to",
-  "evaluation": {
-    "accuracy": 0.621031746031746
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json
deleted file mode 100644
index 8ee2b197de90b9471d5898fed01f8d44b6ffbe7f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "stand for",
-  "evaluation": {
-    "accuracy": 0.5158730158730159
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json b/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json
deleted file mode 100644
index bb725b17616be084da49f9d036df24a26aa8fc3e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "underscore refer to",
-  "evaluation": {
-    "accuracy": 0.6765873015873016
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/GPT-3_style/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/GPT-3_style/results.json
deleted file mode 100644
index 624ef5ca1c606f97954ef9489cd9b411b4aacdd9..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r1",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.497
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json
deleted file mode 100644
index 17bea05f2e6e2b5dd592b8a4f29bb60008648757..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r1",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.442
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/can_we_infer/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/can_we_infer/results.json
deleted file mode 100644
index 724d47c0dc6f6ef4d49e9caf8b4b2b63b9b58ac7..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r1",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.456
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 5309d4d5e0a389c68d33eac1f803108aec5560b6..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r1",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.328
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/justified_in_saying/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/justified_in_saying/results.json
deleted file mode 100644
index c116ebcd4d24a3000e5a7370b54433dece9b944e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r1/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r1",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.46
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/GPT-3_style/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/GPT-3_style/results.json
deleted file mode 100644
index eccf8c1551bd97f197744e7516bcf00470f66e65..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r2",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.45
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json
deleted file mode 100644
index 42e7d317c933d0b4944e0282146dc394124acb68..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r2",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.382
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/can_we_infer/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/can_we_infer/results.json
deleted file mode 100644
index bcf2167c91a3a87a89a39b4ba636a92c1c0499ee..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r2",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.419
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 3b5e80243b6a064cff22a8ae1358d3a1065f5cc6..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r2",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.345
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/justified_in_saying/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/justified_in_saying/results.json
deleted file mode 100644
index 104e8bc71bf21d47b3390778a2bae08084763b39..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r2/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r2",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.41
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/GPT-3_style/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/GPT-3_style/results.json
deleted file mode 100644
index ceae21ce359e3db0adf8516d4199124caa0e5a81..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r3",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.4558333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json
deleted file mode 100644
index 5e133c1acdb994634d864698c93a5014e3f47019..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r3",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.41333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/can_we_infer/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/can_we_infer/results.json
deleted file mode 100644
index dc8705fa2dffaf0d5ab82b34ab8e28b6c4c9ae1e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r3",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.4225
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json
deleted file mode 100644
index dc1c9c2cbed6db1dba1a6b120cd962090e1c94b9..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r3",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.305
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/justified_in_saying/results.json b/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/justified_in_saying/results.json
deleted file mode 100644
index 81a1ad51681d7dba238377f77944974385162d26..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/anli/dev_r3/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r3",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.4083333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/merged.csv b/evaluation_bloomz-mt/evaluation_l1/merged.csv
deleted file mode 100644
index e22f6c3d3ab721a945ee943a9fe18ad51e742d92..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/merged.csv
+++ /dev/null
@@ -1,194 +0,0 @@
-dataset,prompt,metric,value
-anli_dev_r1,GPT-3 style,accuracy,0.497
-anli_dev_r1,MNLI crowdsource,accuracy,0.442
-anli_dev_r1,can we infer,accuracy,0.456
-anli_dev_r1,guaranteed/possible/impossible,accuracy,0.328
-anli_dev_r1,justified in saying,accuracy,0.46
-anli_dev_r1,median,accuracy,0.456
-anli_dev_r2,GPT-3 style,accuracy,0.45
-anli_dev_r2,MNLI crowdsource,accuracy,0.382
-anli_dev_r2,can we infer,accuracy,0.419
-anli_dev_r2,guaranteed/possible/impossible,accuracy,0.345
-anli_dev_r2,justified in saying,accuracy,0.41
-anli_dev_r2,median,accuracy,0.41
-anli_dev_r3,GPT-3 style,accuracy,0.4558333333333333
-anli_dev_r3,MNLI crowdsource,accuracy,0.41333333333333333
-anli_dev_r3,can we infer,accuracy,0.4225
-anli_dev_r3,guaranteed/possible/impossible,accuracy,0.305
-anli_dev_r3,justified in saying,accuracy,0.4083333333333333
-anli_dev_r3,median,accuracy,0.41333333333333333
-story_cloze_2016,Answer Given options,accuracy,0.9524318546231961
-story_cloze_2016,Choose Story Ending,accuracy,0.9668626402993051
-story_cloze_2016,Generate Ending,accuracy,0.7760555852485302
-story_cloze_2016,Novel Correct Ending,accuracy,0.9583110636023516
-story_cloze_2016,Story Continuation and Options,accuracy,0.9593800106894709
-story_cloze_2016,median,accuracy,0.9583110636023516
-super_glue_cb,GPT-3 style,accuracy,0.875
-super_glue_cb,MNLI crowdsource,accuracy,0.35714285714285715
-super_glue_cb,can we infer,accuracy,0.75
-super_glue_cb,guaranteed/possible/impossible,accuracy,0.7678571428571429
-super_glue_cb,justified in saying,accuracy,0.8035714285714286
-super_glue_cb,median,accuracy,0.7678571428571429
-super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.75
-super_glue_copa,best_option,accuracy,0.87
-super_glue_copa,cause_effect,accuracy,0.9
-super_glue_copa,i_am_hesitating,accuracy,0.91
-super_glue_copa,plausible_alternatives,accuracy,0.91
-super_glue_copa,median,accuracy,0.9
-super_glue_rte,GPT-3 style,accuracy,0.7870036101083032
-super_glue_rte,MNLI crowdsource,accuracy,0.8592057761732852
-super_glue_rte,does it follow that,accuracy,0.8194945848375451
-super_glue_rte,guaranteed true,accuracy,0.7942238267148014
-super_glue_rte,should assume,accuracy,0.8122743682310469
-super_glue_rte,median,accuracy,0.8122743682310469
-winogrande_winogrande_xl,Replace,accuracy,0.5998421468034728
-winogrande_winogrande_xl,True or False,accuracy,0.5359116022099447
-winogrande_winogrande_xl,does underscore refer to,accuracy,0.5864246250986582
-winogrande_winogrande_xl,stand for,accuracy,0.5201262825572218
-winogrande_winogrande_xl,underscore refer to,accuracy,0.5880031570639306
-winogrande_winogrande_xl,median,accuracy,0.5864246250986582
-xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.56
-xcopa_id,best_option,accuracy,0.81
-xcopa_id,cause_effect,accuracy,0.87
-xcopa_id,i_am_hesitating,accuracy,0.83
-xcopa_id,plausible_alternatives,accuracy,0.87
-xcopa_id,median,accuracy,0.83
-xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.6
-xcopa_sw,best_option,accuracy,0.62
-xcopa_sw,cause_effect,accuracy,0.64
-xcopa_sw,i_am_hesitating,accuracy,0.66
-xcopa_sw,plausible_alternatives,accuracy,0.64
-xcopa_sw,median,accuracy,0.64
-xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.59
-xcopa_ta,best_option,accuracy,0.66
-xcopa_ta,cause_effect,accuracy,0.7
-xcopa_ta,i_am_hesitating,accuracy,0.69
-xcopa_ta,plausible_alternatives,accuracy,0.64
-xcopa_ta,median,accuracy,0.66
-xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.58
-xcopa_vi,best_option,accuracy,0.81
-xcopa_vi,cause_effect,accuracy,0.91
-xcopa_vi,i_am_hesitating,accuracy,0.85
-xcopa_vi,plausible_alternatives,accuracy,0.84
-xcopa_vi,median,accuracy,0.84
-xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.57
-xcopa_zh,best_option,accuracy,0.84
-xcopa_zh,cause_effect,accuracy,0.86
-xcopa_zh,i_am_hesitating,accuracy,0.86
-xcopa_zh,plausible_alternatives,accuracy,0.81
-xcopa_zh,median,accuracy,0.84
-xnli_ar,GPT-3 style,accuracy,0.5578313253012048
-xnli_ar,MNLI crowdsource,accuracy,0.41164658634538154
-xnli_ar,can we infer,accuracy,0.5152610441767068
-xnli_ar,guaranteed/possible/impossible,accuracy,0.5803212851405622
-xnli_ar,justified in saying,accuracy,0.5184738955823294
-xnli_ar,median,accuracy,0.5184738955823294
-xnli_en,GPT-3 style,accuracy,0.6176706827309237
-xnli_en,MNLI crowdsource,accuracy,0.4606425702811245
-xnli_en,can we infer,accuracy,0.5714859437751004
-xnli_en,guaranteed/possible/impossible,accuracy,0.6180722891566265
-xnli_en,justified in saying,accuracy,0.5746987951807229
-xnli_en,median,accuracy,0.5746987951807229
-xnli_es,GPT-3 style,accuracy,0.5911646586345382
-xnli_es,MNLI crowdsource,accuracy,0.43052208835341366
-xnli_es,can we infer,accuracy,0.4397590361445783
-xnli_es,guaranteed/possible/impossible,accuracy,0.5208835341365462
-xnli_es,justified in saying,accuracy,0.41726907630522087
-xnli_es,median,accuracy,0.4397590361445783
-xnli_fr,GPT-3 style,accuracy,0.5911646586345382
-xnli_fr,MNLI crowdsource,accuracy,0.4321285140562249
-xnli_fr,can we infer,accuracy,0.5369477911646586
-xnli_fr,guaranteed/possible/impossible,accuracy,0.5176706827309236
-xnli_fr,justified in saying,accuracy,0.5385542168674698
-xnli_fr,median,accuracy,0.5369477911646586
-xnli_hi,GPT-3 style,accuracy,0.5208835341365462
-xnli_hi,MNLI crowdsource,accuracy,0.3819277108433735
-xnli_hi,can we infer,accuracy,0.44176706827309237
-xnli_hi,guaranteed/possible/impossible,accuracy,0.5253012048192771
-xnli_hi,justified in saying,accuracy,0.44377510040160645
-xnli_hi,median,accuracy,0.44377510040160645
-xnli_sw,GPT-3 style,accuracy,0.5036144578313253
-xnli_sw,MNLI crowdsource,accuracy,0.3887550200803213
-xnli_sw,can we infer,accuracy,0.44216867469879517
-xnli_sw,guaranteed/possible/impossible,accuracy,0.38795180722891565
-xnli_sw,justified in saying,accuracy,0.4397590361445783
-xnli_sw,median,accuracy,0.4397590361445783
-xnli_ur,GPT-3 style,accuracy,0.4907630522088353
-xnli_ur,MNLI crowdsource,accuracy,0.37309236947791163
-xnli_ur,can we infer,accuracy,0.45863453815261046
-xnli_ur,guaranteed/possible/impossible,accuracy,0.5124497991967871
-xnli_ur,justified in saying,accuracy,0.45582329317269077
-xnli_ur,median,accuracy,0.45863453815261046
-xnli_vi,GPT-3 style,accuracy,0.5582329317269076
-xnli_vi,MNLI crowdsource,accuracy,0.42690763052208835
-xnli_vi,can we infer,accuracy,0.4759036144578313
-xnli_vi,guaranteed/possible/impossible,accuracy,0.5008032128514056
-xnli_vi,justified in saying,accuracy,0.4827309236947791
-xnli_vi,median,accuracy,0.4827309236947791
-xnli_zh,GPT-3 style,accuracy,0.5550200803212851
-xnli_zh,MNLI crowdsource,accuracy,0.4248995983935743
-xnli_zh,can we infer,accuracy,0.43052208835341366
-xnli_zh,guaranteed/possible/impossible,accuracy,0.5526104417670683
-xnli_zh,justified in saying,accuracy,0.44016064257028115
-xnli_zh,median,accuracy,0.44016064257028115
-xstory_cloze_ar,Answer Given options,accuracy,0.7835870284579749
-xstory_cloze_ar,Choose Story Ending,accuracy,0.9291859695565851
-xstory_cloze_ar,Generate Ending,accuracy,0.6624751819986764
-xstory_cloze_ar,Novel Correct Ending,accuracy,0.9252150893448048
-xstory_cloze_ar,Story Continuation and Options,accuracy,0.9159497021839841
-xstory_cloze_ar,median,accuracy,0.9159497021839841
-xstory_cloze_es,Answer Given options,accuracy,0.870946393117141
-xstory_cloze_es,Choose Story Ending,accuracy,0.9523494374586366
-xstory_cloze_es,Generate Ending,accuracy,0.7319655857048313
-xstory_cloze_es,Novel Correct Ending,accuracy,0.9477167438782264
-xstory_cloze_es,Story Continuation and Options,accuracy,0.9516876240900066
-xstory_cloze_es,median,accuracy,0.9477167438782264
-xstory_cloze_eu,Answer Given options,accuracy,0.6982131039046989
-xstory_cloze_eu,Choose Story Ending,accuracy,0.85704831237591
-xstory_cloze_eu,Generate Ending,accuracy,0.614162806088683
-xstory_cloze_eu,Novel Correct Ending,accuracy,0.8590337524818001
-xstory_cloze_eu,Story Continuation and Options,accuracy,0.8504301786896096
-xstory_cloze_eu,median,accuracy,0.8504301786896096
-xstory_cloze_hi,Answer Given options,accuracy,0.7683653209794837
-xstory_cloze_hi,Choose Story Ending,accuracy,0.8742554599602912
-xstory_cloze_hi,Generate Ending,accuracy,0.657180675049636
-xstory_cloze_hi,Novel Correct Ending,accuracy,0.886829913964262
-xstory_cloze_hi,Story Continuation and Options,accuracy,0.8762409000661814
-xstory_cloze_hi,median,accuracy,0.8742554599602912
-xstory_cloze_id,Answer Given options,accuracy,0.8332230311052283
-xstory_cloze_id,Choose Story Ending,accuracy,0.913964262078094
-xstory_cloze_id,Generate Ending,accuracy,0.700198544010589
-xstory_cloze_id,Novel Correct Ending,accuracy,0.9205823957643945
-xstory_cloze_id,Story Continuation and Options,accuracy,0.9086697551290536
-xstory_cloze_id,median,accuracy,0.9086697551290536
-xstory_cloze_zh,Answer Given options,accuracy,0.870946393117141
-xstory_cloze_zh,Choose Story Ending,accuracy,0.9265387160820648
-xstory_cloze_zh,Generate Ending,accuracy,0.6823295830575777
-xstory_cloze_zh,Novel Correct Ending,accuracy,0.928524156187955
-xstory_cloze_zh,Story Continuation and Options,accuracy,0.9232296492389146
-xstory_cloze_zh,median,accuracy,0.9232296492389146
-xwinograd_en,Replace,accuracy,0.6933333333333334
-xwinograd_en,True or False,accuracy,0.5212903225806451
-xwinograd_en,does underscore refer to,accuracy,0.6563440860215054
-xwinograd_en,stand for,accuracy,0.5156989247311828
-xwinograd_en,underscore refer to,accuracy,0.6473118279569893
-xwinograd_en,median,accuracy,0.6473118279569893
-xwinograd_fr,Replace,accuracy,0.6024096385542169
-xwinograd_fr,True or False,accuracy,0.46987951807228917
-xwinograd_fr,does underscore refer to,accuracy,0.5903614457831325
-xwinograd_fr,stand for,accuracy,0.4939759036144578
-xwinograd_fr,underscore refer to,accuracy,0.6867469879518072
-xwinograd_fr,median,accuracy,0.5903614457831325
-xwinograd_pt,Replace,accuracy,0.6463878326996197
-xwinograd_pt,True or False,accuracy,0.5285171102661597
-xwinograd_pt,does underscore refer to,accuracy,0.6007604562737643
-xwinograd_pt,stand for,accuracy,0.49809885931558934
-xwinograd_pt,underscore refer to,accuracy,0.6083650190114068
-xwinograd_pt,median,accuracy,0.6007604562737643
-xwinograd_zh,Replace,accuracy,0.7063492063492064
-xwinograd_zh,True or False,accuracy,0.5515873015873016
-xwinograd_zh,does underscore refer to,accuracy,0.621031746031746
-xwinograd_zh,stand for,accuracy,0.5158730158730159
-xwinograd_zh,underscore refer to,accuracy,0.6765873015873016
-xwinograd_zh,median,accuracy,0.621031746031746
-multiple,average,multiple,0.6665267892901372
diff --git a/evaluation_bloomz-mt/evaluation_l1/merged.json b/evaluation_bloomz-mt/evaluation_l1/merged.json
deleted file mode 100644
index 501d6ffa6ac3a5c6e3c21ad460331ab55203d9b0..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/merged.json
+++ /dev/null
@@ -1 +0,0 @@
-{"Muennighoff/xstory_cloze_ar": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7835870284579749}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9291859695565851}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6624751819986764}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9252150893448048}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9159497021839841}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_es": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.870946393117141}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9523494374586366}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7319655857048313}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9477167438782264}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9516876240900066}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_eu": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6982131039046989}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.85704831237591}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.614162806088683}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8590337524818001}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8504301786896096}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_hi": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7683653209794837}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8742554599602912}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.657180675049636}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.886829913964262}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8762409000661814}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_id": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8332230311052283}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.913964262078094}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.700198544010589}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9205823957643945}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9086697551290536}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_zh": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.870946393117141}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9265387160820648}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6823295830575777}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.928524156187955}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9232296492389146}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xwinograd_en": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6933333333333334}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5212903225806451}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6563440860215054}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5156989247311828}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6473118279569893}, "template_name": "underscore refer to"}}, "Muennighoff/xwinograd_fr": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6024096385542169}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.46987951807228917}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5903614457831325}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.4939759036144578}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6867469879518072}, "template_name": "underscore refer to"}}, "Muennighoff/xwinograd_pt": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6463878326996197}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5285171102661597}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6007604562737643}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.49809885931558934}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6083650190114068}, "template_name": "underscore refer to"}}, "Muennighoff/xwinograd_zh": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.7063492063492064}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5515873015873016}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.621031746031746}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5158730158730159}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6765873015873016}, "template_name": "underscore refer to"}}, "anli_dev_r1": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.497}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.442}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.456}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.328}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.46}, "template_name": "justified in saying"}}, "anli_dev_r2": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.45}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.382}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.419}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.345}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.41}, "template_name": "justified in saying"}}, "anli_dev_r3": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.4558333333333333}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.41333333333333333}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.4225}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.305}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.4083333333333333}, "template_name": "justified in saying"}}, "story_cloze_2016": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.9524318546231961}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.9668626402993051}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.7760555852485302}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.9583110636023516}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.9593800106894709}, "template_name": "Story Continuation and Options"}}, "super_glue_cb": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.875}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.35714285714285715}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.75}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.7678571428571429}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.8035714285714286}, "template_name": "justified in saying"}}, "super_glue_copa": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name=None, template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.75}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.87}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.9}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.91}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.91}, "template_name": "plausible_alternatives"}}, "super_glue_rte": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.7870036101083032}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.8592057761732852}, "template_name": "MNLI crowdsource"}, "does it follow that": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does it follow that', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.8194945848375451}, "template_name": "does it follow that"}, "guaranteed true": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed true', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.7942238267148014}, "template_name": "guaranteed true"}, "should assume": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='should assume', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.8122743682310469}, "template_name": "should assume"}}, "winogrande_winogrande_xl": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5998421468034728}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5359116022099447}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5864246250986582}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5201262825572218}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5880031570639306}, "template_name": "underscore refer to"}}, "xcopa_id": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.56}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.81}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.87}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.83}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.87}, "template_name": "plausible_alternatives"}}, "xcopa_sw": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.6}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.66}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "plausible_alternatives"}}, "xcopa_ta": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.59}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.66}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.7}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.69}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "plausible_alternatives"}}, "xcopa_vi": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.58}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.81}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.91}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.85}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.84}, "template_name": "plausible_alternatives"}}, "xcopa_zh": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.57}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.84}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.86}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.86}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.81}, "template_name": "plausible_alternatives"}}, "xnli_ar": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5578313253012048}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41164658634538154}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5152610441767068}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5803212851405622}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5184738955823294}, "template_name": "justified in saying"}}, "xnli_en": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.6176706827309237}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4606425702811245}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5714859437751004}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.6180722891566265}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5746987951807229}, "template_name": "justified in saying"}}, "xnli_es": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5911646586345382}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43052208835341366}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4397590361445783}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5208835341365462}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41726907630522087}, "template_name": "justified in saying"}}, "xnli_fr": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5911646586345382}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4321285140562249}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5369477911646586}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5176706827309236}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5385542168674698}, "template_name": "justified in saying"}}, "xnli_hi": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5208835341365462}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3819277108433735}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44176706827309237}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5253012048192771}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44377510040160645}, "template_name": "justified in saying"}}, "xnli_sw": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5036144578313253}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3887550200803213}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44216867469879517}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.38795180722891565}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4397590361445783}, "template_name": "justified in saying"}}, "xnli_ur": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4907630522088353}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.37309236947791163}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.45863453815261046}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5124497991967871}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.45582329317269077}, "template_name": "justified in saying"}}, "xnli_vi": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5582329317269076}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.42690763052208835}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4759036144578313}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5008032128514056}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4827309236947791}, "template_name": "justified in saying"}}, "xnli_zh": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5550200803212851}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4248995983935743}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43052208835341366}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5526104417670683}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44016064257028115}, "template_name": "justified in saying"}}}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json b/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json
deleted file mode 100644
index c17d76a62d827c060ee745ccf9611eac8ff3cdac..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "story_cloze",
-  "dataset_config_name": "2016",
-  "template_name": "Answer Given options",
-  "evaluation": {
-    "accuracy": 0.9524318546231961
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json
deleted file mode 100644
index b402b37266757783bf6291bcee1c9cfa46d216f1..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "story_cloze",
-  "dataset_config_name": "2016",
-  "template_name": "Choose Story Ending",
-  "evaluation": {
-    "accuracy": 0.9668626402993051
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Generate_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Generate_Ending/results.json
deleted file mode 100644
index fb9415efb88e94a410bb8a2473f70078b3feffb3..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Generate_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "story_cloze",
-  "dataset_config_name": "2016",
-  "template_name": "Generate Ending",
-  "evaluation": {
-    "accuracy": 0.7760555852485302
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json b/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json
deleted file mode 100644
index 4c6a1159adea4b46e6d68e29b5ccf22f7d3eeded..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "story_cloze",
-  "dataset_config_name": "2016",
-  "template_name": "Novel Correct Ending",
-  "evaluation": {
-    "accuracy": 0.9583110636023516
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json b/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json
deleted file mode 100644
index 2a5a5c76a3c4b4526a73cf505aa142660bf95af7..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "story_cloze",
-  "dataset_config_name": "2016",
-  "template_name": "Story Continuation and Options",
-  "evaluation": {
-    "accuracy": 0.9593800106894709
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/GPT-3_style/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/GPT-3_style/results.json
deleted file mode 100644
index 150551119a34190b84b09a536f5bd1058b09bf1c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "cb",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.875
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json
deleted file mode 100644
index f504eb8fa769b4398bc5bfed5fd0032fbb5b979e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "cb",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.35714285714285715
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/can_we_infer/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/can_we_infer/results.json
deleted file mode 100644
index 5e4c7c789ae4d0104e023f693f89f383f2d7765c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "cb",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.75
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 52ea214f6c6b2ebd759e937adf6da63abfc8cc43..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "cb",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.7678571428571429
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/justified_in_saying/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/justified_in_saying/results.json
deleted file mode 100644
index 8d1911dc1d0b22bc55f044765a8435c1f0ca95c2..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/cb/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "cb",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.8035714285714286
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json
deleted file mode 100644
index fd07711fac65df94ecc461abbb54e79c4c29e1b1..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "copa",
-  "template_name": "C1 or C2? premise, so/because\u2026",
-  "evaluation": {
-    "accuracy": 0.75
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name=None, template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/best_option/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/best_option/results.json
deleted file mode 100644
index 5bbfcd2b3a8c506efc556c73a0a3177eef86355c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/best_option/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "copa",
-  "template_name": "best_option",
-  "evaluation": {
-    "accuracy": 0.87
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/cause_effect/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/cause_effect/results.json
deleted file mode 100644
index 92a200f978839cc7833bf68d793994f49f880572..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/cause_effect/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "copa",
-  "template_name": "cause_effect",
-  "evaluation": {
-    "accuracy": 0.9
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/i_am_hesitating/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/i_am_hesitating/results.json
deleted file mode 100644
index 103509431a1ccfac1b771cd1d235bce0d3b70c4b..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/i_am_hesitating/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "copa",
-  "template_name": "i_am_hesitating",
-  "evaluation": {
-    "accuracy": 0.91
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/plausible_alternatives/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/plausible_alternatives/results.json
deleted file mode 100644
index 0db9f991a03dbdd962d70f3b63f0ccc795887f73..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/copa/plausible_alternatives/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "copa",
-  "template_name": "plausible_alternatives",
-  "evaluation": {
-    "accuracy": 0.91
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/GPT-3_style/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/GPT-3_style/results.json
deleted file mode 100644
index 074738d3a743bb6e89779674116678b7551cfb54..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "rte",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.7870036101083032
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json
deleted file mode 100644
index 306946726d625ff5536f78fcee8d6be028f7a901..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "rte",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.8592057761732852
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/does_it_follow_that/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/does_it_follow_that/results.json
deleted file mode 100644
index 92225b08e90d636f57f4ac20ac58b082a7774712..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/does_it_follow_that/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "rte",
-  "template_name": "does it follow that",
-  "evaluation": {
-    "accuracy": 0.8194945848375451
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does it follow that', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/guaranteed_true/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/guaranteed_true/results.json
deleted file mode 100644
index 13e0a08ce6250b6f6e5a37faa695793aa0bbecb0..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/guaranteed_true/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "rte",
-  "template_name": "guaranteed true",
-  "evaluation": {
-    "accuracy": 0.7942238267148014
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed true', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/should_assume/results.json b/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/should_assume/results.json
deleted file mode 100644
index 787186a80cc2515fe6118deece5a17d189a47d25..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/super_glue/rte/should_assume/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "rte",
-  "template_name": "should assume",
-  "evaluation": {
-    "accuracy": 0.8122743682310469
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='should assume', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/Replace/results.json b/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/Replace/results.json
deleted file mode 100644
index 42019f2d1257574dd2ec471f125ef67a3dd6f761..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/Replace/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "winogrande",
-  "dataset_config_name": "winogrande_xl",
-  "template_name": "Replace",
-  "evaluation": {
-    "accuracy": 0.5998421468034728
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json b/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json
deleted file mode 100644
index 7570b5a289024d5d31ca1e682a269567a07df00f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "winogrande",
-  "dataset_config_name": "winogrande_xl",
-  "template_name": "True or False",
-  "evaluation": {
-    "accuracy": 0.5359116022099447
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json b/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json
deleted file mode 100644
index bbe231ea5c0a34aea5c9f156b5736f80199ac088..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "winogrande",
-  "dataset_config_name": "winogrande_xl",
-  "template_name": "does underscore refer to",
-  "evaluation": {
-    "accuracy": 0.5864246250986582
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json b/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json
deleted file mode 100644
index 6819315d5fa6359d1ebf74890878512463babfd8..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "winogrande",
-  "dataset_config_name": "winogrande_xl",
-  "template_name": "stand for",
-  "evaluation": {
-    "accuracy": 0.5201262825572218
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json b/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json
deleted file mode 100644
index e5cf04d02a842555ba0d5dd39f2b3298ffba8250..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "winogrande",
-  "dataset_config_name": "winogrande_xl",
-  "template_name": "underscore refer to",
-  "evaluation": {
-    "accuracy": 0.5880031570639306
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json
deleted file mode 100644
index a1088c9040872e3ae150c45ed74645051dbe1144..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "C1 or C2? premise, so/because\u2026",
-  "evaluation": {
-    "accuracy": 0.56
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/id/best_option/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/id/best_option/results.json
deleted file mode 100644
index a20d27e3166b1f2c426acae2733282ee425f394d..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/id/best_option/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "best_option",
-  "evaluation": {
-    "accuracy": 0.81
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/id/cause_effect/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/id/cause_effect/results.json
deleted file mode 100644
index 3c5e9acce7d4e1f6ae99e36d6c9ca92de41d7e30..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/id/cause_effect/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "cause_effect",
-  "evaluation": {
-    "accuracy": 0.87
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/id/i_am_hesitating/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/id/i_am_hesitating/results.json
deleted file mode 100644
index bd8d6933b8ef7729962ea2c239587e9cbd2311ec..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/id/i_am_hesitating/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "i_am_hesitating",
-  "evaluation": {
-    "accuracy": 0.83
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/id/plausible_alternatives/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/id/plausible_alternatives/results.json
deleted file mode 100644
index 7a367c68f6181bbe1d1a9acb9f57b9f7228d2561..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/id/plausible_alternatives/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "plausible_alternatives",
-  "evaluation": {
-    "accuracy": 0.87
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/C1_or_C2?_premise/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/C1_or_C2?_premise/results.json
deleted file mode 100644
index 923cf175a2ea8cee40930c011a9e6277f7ec24fb..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/C1_or_C2?_premise/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "C1 or C2? premise, so/because\u2026",
-  "evaluation": {
-    "accuracy": 0.6
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/best_option/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/best_option/results.json
deleted file mode 100644
index a23f938d1d48be3de69b0ba28625fc4bf2b5e7f3..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/best_option/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "best_option",
-  "evaluation": {
-    "accuracy": 0.62
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/cause_effect/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/cause_effect/results.json
deleted file mode 100644
index 72aaf4a87fbe31b7d4b265f0700c6b3d7e1d85e2..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/cause_effect/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "cause_effect",
-  "evaluation": {
-    "accuracy": 0.64
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/i_am_hesitating/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/i_am_hesitating/results.json
deleted file mode 100644
index e408b253e1bb3a3ea64e06a37a48df3244969442..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/i_am_hesitating/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "i_am_hesitating",
-  "evaluation": {
-    "accuracy": 0.66
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/plausible_alternatives/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/plausible_alternatives/results.json
deleted file mode 100644
index 1df4cfe999e3301fb63258105a6bd22d6d8a87d0..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/sw/plausible_alternatives/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "plausible_alternatives",
-  "evaluation": {
-    "accuracy": 0.64
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/C1_or_C2?_premise/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/C1_or_C2?_premise/results.json
deleted file mode 100644
index 30417333f2d8e1c2f584a1d61725aea60b65c6f1..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/C1_or_C2?_premise/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "C1 or C2? premise, so/because\u2026",
-  "evaluation": {
-    "accuracy": 0.59
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/best_option/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/best_option/results.json
deleted file mode 100644
index e2654eb7c7fdcc3225e2b671e77820a685159580..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/best_option/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "best_option",
-  "evaluation": {
-    "accuracy": 0.66
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/cause_effect/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/cause_effect/results.json
deleted file mode 100644
index 06ed968210883b1677420df46920ae8767ae10d4..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/cause_effect/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "cause_effect",
-  "evaluation": {
-    "accuracy": 0.7
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/i_am_hesitating/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/i_am_hesitating/results.json
deleted file mode 100644
index 7f856249983a23f49cc7c0ff874576f3165782d5..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/i_am_hesitating/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "i_am_hesitating",
-  "evaluation": {
-    "accuracy": 0.69
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/plausible_alternatives/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/plausible_alternatives/results.json
deleted file mode 100644
index 57306bd7fddc09ae459ae50228a5d0e5845181e0..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/ta/plausible_alternatives/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "plausible_alternatives",
-  "evaluation": {
-    "accuracy": 0.64
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/C1_or_C2?_premise/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/C1_or_C2?_premise/results.json
deleted file mode 100644
index ae38a9d01d5d6923042592213e5294c326c81bf7..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/C1_or_C2?_premise/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "C1 or C2? premise, so/because\u2026",
-  "evaluation": {
-    "accuracy": 0.58
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/best_option/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/best_option/results.json
deleted file mode 100644
index b8dc7ac5d1e291253d6c136a15338d56638ad7ea..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/best_option/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "best_option",
-  "evaluation": {
-    "accuracy": 0.81
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/cause_effect/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/cause_effect/results.json
deleted file mode 100644
index 6e7406329c9039078fb7ae9d2ef72155ae008417..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/cause_effect/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "cause_effect",
-  "evaluation": {
-    "accuracy": 0.91
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/i_am_hesitating/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/i_am_hesitating/results.json
deleted file mode 100644
index f41710a911538208550e160eed17539d59b24bf1..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/i_am_hesitating/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "i_am_hesitating",
-  "evaluation": {
-    "accuracy": 0.85
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/plausible_alternatives/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/plausible_alternatives/results.json
deleted file mode 100644
index cf1bea02a25840d8ecb41d39a44242cef297b981..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/vi/plausible_alternatives/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "plausible_alternatives",
-  "evaluation": {
-    "accuracy": 0.84
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/C1_or_C2?_premise/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/C1_or_C2?_premise/results.json
deleted file mode 100644
index a9a4b3b430590a5347eaef9a5d4e361726881045..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/C1_or_C2?_premise/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "C1 or C2? premise, so/because\u2026",
-  "evaluation": {
-    "accuracy": 0.57
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/best_option/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/best_option/results.json
deleted file mode 100644
index aed5c9b07cc6b2a3d46478aea301f538d1b3775a..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/best_option/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "best_option",
-  "evaluation": {
-    "accuracy": 0.84
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/cause_effect/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/cause_effect/results.json
deleted file mode 100644
index d7e9010b77cae2b075cdc0513155deb6d81aa382..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/cause_effect/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "cause_effect",
-  "evaluation": {
-    "accuracy": 0.86
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/i_am_hesitating/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/i_am_hesitating/results.json
deleted file mode 100644
index 8f11039c838151ff3b5a0a2e27634e683ec56a1f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/i_am_hesitating/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "i_am_hesitating",
-  "evaluation": {
-    "accuracy": 0.86
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/plausible_alternatives/results.json b/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/plausible_alternatives/results.json
deleted file mode 100644
index 150f66521abd7106df9da22a6386cb009e460e36..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xcopa/zh/plausible_alternatives/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "plausible_alternatives",
-  "evaluation": {
-    "accuracy": 0.81
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/ar/GPT-3_style/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/ar/GPT-3_style/results.json
deleted file mode 100644
index ff01b7379c993414d086d9db17854a7205ac6cc6..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/ar/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.5578313253012048
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/ar/MNLI_crowdsource/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/ar/MNLI_crowdsource/results.json
deleted file mode 100644
index ac2ea9421148ad0ae4106e400778c01975b51947..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/ar/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.41164658634538154
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/ar/can_we_infer/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/ar/can_we_infer/results.json
deleted file mode 100644
index dfcac9ad9cb913582ab0487da22cea8e0d7e8e23..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/ar/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.5152610441767068
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/ar/guaranteed_possible_impossible/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/ar/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 5b11969b0e13f92bc6a6161eaf64e997249061de..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/ar/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.5803212851405622
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/ar/justified_in_saying/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/ar/justified_in_saying/results.json
deleted file mode 100644
index efe506af9c6d3273a1318b96a97687fac279971e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/ar/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.5184738955823294
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/en/GPT-3_style/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/en/GPT-3_style/results.json
deleted file mode 100644
index f210a5ed2192d1d6a9b76886e07ad2d5e0b43f00..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/en/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "en",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.6176706827309237
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/en/MNLI_crowdsource/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/en/MNLI_crowdsource/results.json
deleted file mode 100644
index aab3affbd4f922e20a2e7ff49377b5a9db287063..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/en/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "en",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.4606425702811245
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/en/can_we_infer/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/en/can_we_infer/results.json
deleted file mode 100644
index c81e210fafab735c745e71971539d6067357f32d..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/en/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "en",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.5714859437751004
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/en/guaranteed_possible_impossible/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/en/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 3f6783548974144b51902dcc85cea22068f7b94a..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/en/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "en",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.6180722891566265
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/en/justified_in_saying/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/en/justified_in_saying/results.json
deleted file mode 100644
index 1c9543f0f76df9a579da6abc88f92b6bdd7e2675..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/en/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "en",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.5746987951807229
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/es/GPT-3_style/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/es/GPT-3_style/results.json
deleted file mode 100644
index 1d004908cc470c9648fe66b283ad455e1b834ef0..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/es/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.5911646586345382
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/es/MNLI_crowdsource/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/es/MNLI_crowdsource/results.json
deleted file mode 100644
index 121a0c50ffb27924c389db1d5d51037b5d6f5491..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/es/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.43052208835341366
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/es/can_we_infer/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/es/can_we_infer/results.json
deleted file mode 100644
index 20d1691995a8a5433ecb240cb1c3d03746c2a51f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/es/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.4397590361445783
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/es/guaranteed_possible_impossible/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/es/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 49b23b559b781c83bad56c90e5174f1e0b28f1f8..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/es/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.5208835341365462
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/es/justified_in_saying/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/es/justified_in_saying/results.json
deleted file mode 100644
index afeca9dbd848afc14b8b19d223b64f7d5efaaeaa..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/es/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.41726907630522087
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/fr/GPT-3_style/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/fr/GPT-3_style/results.json
deleted file mode 100644
index 98dfab12ef834b10246c6965b6193ecddadd61e6..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/fr/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.5911646586345382
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/fr/MNLI_crowdsource/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/fr/MNLI_crowdsource/results.json
deleted file mode 100644
index 45c5d22e3bf400249196b61eeecaa7da720505e5..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/fr/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.4321285140562249
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/fr/can_we_infer/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/fr/can_we_infer/results.json
deleted file mode 100644
index 77d6f700093348996e1d24295849682c1ee5ac64..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/fr/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.5369477911646586
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/fr/guaranteed_possible_impossible/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/fr/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 88f476e6a408c2d37aa3e4e651313e71bc7f1b3f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/fr/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.5176706827309236
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/fr/justified_in_saying/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/fr/justified_in_saying/results.json
deleted file mode 100644
index e4bd8c269d5762e1069423690ccdde27e4e56e3b..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/fr/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.5385542168674698
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/hi/GPT-3_style/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/hi/GPT-3_style/results.json
deleted file mode 100644
index 90d22bda903ba93a0e0324665ec828fc885d8156..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/hi/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.5208835341365462
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/hi/MNLI_crowdsource/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/hi/MNLI_crowdsource/results.json
deleted file mode 100644
index 0b58e1ba7eec12668ad871080febcfaf9860649f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/hi/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.3819277108433735
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/hi/can_we_infer/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/hi/can_we_infer/results.json
deleted file mode 100644
index b58a33fd1887c977c1b06221f564dbe217c2e539..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/hi/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.44176706827309237
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/hi/guaranteed_possible_impossible/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/hi/guaranteed_possible_impossible/results.json
deleted file mode 100644
index c768d703f49038fc3b9b146f5f22fe421746912a..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/hi/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.5253012048192771
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/hi/justified_in_saying/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/hi/justified_in_saying/results.json
deleted file mode 100644
index 176c133af7bf8b355f7e2b4214c435c7b241de40..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/hi/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.44377510040160645
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/sw/GPT-3_style/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/sw/GPT-3_style/results.json
deleted file mode 100644
index 745c75053efd47e267b816d7fc9ef9059b9af430..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/sw/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.5036144578313253
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/sw/MNLI_crowdsource/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/sw/MNLI_crowdsource/results.json
deleted file mode 100644
index 1fbf12e7a9737e4355d54e58ae5db9d89f701b93..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/sw/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.3887550200803213
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/sw/can_we_infer/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/sw/can_we_infer/results.json
deleted file mode 100644
index c04f874fe0f58a56b90ffb038bee36b10fbb2b8c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/sw/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.44216867469879517
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/sw/guaranteed_possible_impossible/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/sw/guaranteed_possible_impossible/results.json
deleted file mode 100644
index a1a771df194005c2f9a30786e341de8e9a310609..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/sw/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.38795180722891565
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/sw/justified_in_saying/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/sw/justified_in_saying/results.json
deleted file mode 100644
index b06b5ac8c564d4da30284d6f8d545022a192b420..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/sw/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.4397590361445783
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/ur/GPT-3_style/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/ur/GPT-3_style/results.json
deleted file mode 100644
index 361ed448d271cdef8b6876d50d3e9500e535bcc8..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/ur/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.4907630522088353
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/ur/MNLI_crowdsource/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/ur/MNLI_crowdsource/results.json
deleted file mode 100644
index d09364d03f5460b06357537390716479b588a719..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/ur/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.37309236947791163
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/ur/can_we_infer/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/ur/can_we_infer/results.json
deleted file mode 100644
index 55616a19ddbd889cc33255b73fdc8fa6b91e68ec..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/ur/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.45863453815261046
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/ur/guaranteed_possible_impossible/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/ur/guaranteed_possible_impossible/results.json
deleted file mode 100644
index e4d0ae8991d72c90be89ca5f763e817b03ac05d5..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/ur/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.5124497991967871
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/ur/justified_in_saying/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/ur/justified_in_saying/results.json
deleted file mode 100644
index eb743b1f0c2c178dec6881a3be1d5336589cbfa1..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/ur/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.45582329317269077
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/vi/GPT-3_style/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/vi/GPT-3_style/results.json
deleted file mode 100644
index e77b5f3004afba26184222f7a45fadcaabd63989..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/vi/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.5582329317269076
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/vi/MNLI_crowdsource/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/vi/MNLI_crowdsource/results.json
deleted file mode 100644
index f84b726698ca47e601a1aa4e3896d0ee7185e510..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/vi/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.42690763052208835
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/vi/can_we_infer/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/vi/can_we_infer/results.json
deleted file mode 100644
index 4fd391f132bfedf7cd39efd2b6656b564d343458..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/vi/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.4759036144578313
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/vi/guaranteed_possible_impossible/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/vi/guaranteed_possible_impossible/results.json
deleted file mode 100644
index ab70dc2fca42dab93474d9a5f5432d1bc74968ff..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/vi/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.5008032128514056
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/vi/justified_in_saying/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/vi/justified_in_saying/results.json
deleted file mode 100644
index 5186d53e6ea41cde8f8a201fa2699a49eb570ba5..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/vi/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.4827309236947791
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/zh/GPT-3_style/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/zh/GPT-3_style/results.json
deleted file mode 100644
index 5fbaef1325c16e12a77c3c23abd166c6fa411e1c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/zh/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.5550200803212851
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/zh/MNLI_crowdsource/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/zh/MNLI_crowdsource/results.json
deleted file mode 100644
index 1b2ff42f6dc33b2fb66c0047bacea591578bf91e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/zh/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.4248995983935743
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/zh/can_we_infer/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/zh/can_we_infer/results.json
deleted file mode 100644
index 82d063dec3853aa0ea836bb0c48c95984a99f234..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/zh/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.43052208835341366
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/zh/guaranteed_possible_impossible/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/zh/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 18da408e415812cc705f7f27ba28ee3069c8b85a..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/zh/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.5526104417670683
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_l1/xnli/zh/justified_in_saying/results.json b/evaluation_bloomz-mt/evaluation_l1/xnli/zh/justified_in_saying/results.json
deleted file mode 100644
index f668b2453fefbc4459efecb5e12ee1831705740c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_l1/xnli/zh/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.44016064257028115
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json
deleted file mode 100644
index 08eaf8d55649a67760962798f16115077603ea06..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Answer Given options_armt",
-  "evaluation": {
-    "accuracy": 0.8941098610191925
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Answer Given options_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json
deleted file mode 100644
index 773f18b6f44483c2b276c74fa8dba3a73173f336..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Choose Story Ending_armt",
-  "evaluation": {
-    "accuracy": 0.9404367968232958
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Choose Story Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json
deleted file mode 100644
index 1019f9b10c16be41066096a733d486afaec79fe9..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Generate Ending_armt",
-  "evaluation": {
-    "accuracy": 0.6598279285241562
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Generate Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json
deleted file mode 100644
index 2ef0ca9ebf98a076846d9f2cfc5e299323efb0ce..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Novel Correct Ending_armt",
-  "evaluation": {
-    "accuracy": 0.9272005294506949
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Novel Correct Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json
deleted file mode 100644
index 8a8a2cfc67d927bf9dd38579780168915877b79f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Story Continuation and Options_armt",
-  "evaluation": {
-    "accuracy": 0.9172733289212442
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Story Continuation and Options_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json
deleted file mode 100644
index 9346b5f9fe2b425df3ba7ae90573ef5c7e7fb3d1..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Answer Given options_esmt",
-  "evaluation": {
-    "accuracy": 0.9311714096624751
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Answer Given options_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json
deleted file mode 100644
index 0ecff6e31deb89478ba84a5dc3a59031dc4c7704..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Choose Story Ending_esmt",
-  "evaluation": {
-    "accuracy": 0.9549966909331569
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Choose Story Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json
deleted file mode 100644
index 42e4c5650dd6e7becbb01e1846ed5287011574a3..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Generate Ending_esmt",
-  "evaluation": {
-    "accuracy": 0.7405691594970218
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Generate Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json
deleted file mode 100644
index 2d175315a4a49d7ffbc1c794488f2c4170c72aa3..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Novel Correct Ending_esmt",
-  "evaluation": {
-    "accuracy": 0.9490403706154864
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Novel Correct Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json
deleted file mode 100644
index a92049818a18be1bd1dc53949dd5f5dab08bb5b6..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Story Continuation and Options_esmt",
-  "evaluation": {
-    "accuracy": 0.9523494374586366
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Story Continuation and Options_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json
deleted file mode 100644
index 82077be9a713eba8286c85e8f7a54f405f67493c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Answer Given options_eumt",
-  "evaluation": {
-    "accuracy": 0.7326273990734613
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Answer Given options_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json
deleted file mode 100644
index 6ffff53ee20de1422ddc411de36952d95f5c26df..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Choose Story Ending_eumt",
-  "evaluation": {
-    "accuracy": 0.8682991396426207
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Choose Story Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json
deleted file mode 100644
index 11f291c388061a9a54cebf327d928b0e57b3b414..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Generate Ending_eumt",
-  "evaluation": {
-    "accuracy": 0.6293845135671741
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Generate Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json
deleted file mode 100644
index 6f8a03f86ea1eb3fed92e88b02a7d03e38c1a0a2..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Novel Correct Ending_eumt",
-  "evaluation": {
-    "accuracy": 0.8305757776307081
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Novel Correct Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json
deleted file mode 100644
index 2e3bb84f6e97d3686dfadd1dd1241be67e255127..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Story Continuation and Options_eumt",
-  "evaluation": {
-    "accuracy": 0.8259430840502978
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Story Continuation and Options_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json
deleted file mode 100644
index 9f600887dc434b8634d25cb214fceaeb6670e60e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Answer Given options_himt",
-  "evaluation": {
-    "accuracy": 0.8530774321641297
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Answer Given options_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json
deleted file mode 100644
index 5d10e8076e863899177eb48671f8f8510760cabc..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Choose Story Ending_himt",
-  "evaluation": {
-    "accuracy": 0.8914626075446724
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Choose Story Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json
deleted file mode 100644
index 748d79306b080fe587762245d865188c38858bdb..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Generate Ending_himt",
-  "evaluation": {
-    "accuracy": 0.6644606221045665
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Generate Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json
deleted file mode 100644
index 12f20fab7fb8dfe1bed8c7d32adaf44c74a8ea79..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Novel Correct Ending_himt",
-  "evaluation": {
-    "accuracy": 0.8821972203838517
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Novel Correct Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json
deleted file mode 100644
index 57a270e97d7e8a4d2a480b22824e9bd101b5c200..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Story Continuation and Options_himt",
-  "evaluation": {
-    "accuracy": 0.8735936465916612
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Story Continuation and Options_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json
deleted file mode 100644
index 7455cbf4b0e4932b109dabdb981ceca022d78048..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Answer Given options_idmt",
-  "evaluation": {
-    "accuracy": 0.8682991396426207
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Answer Given options_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json
deleted file mode 100644
index 7ac777f4ceac3494e1523f45ddbe7948803f95df..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Choose Story Ending_idmt",
-  "evaluation": {
-    "accuracy": 0.927862342819325
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Choose Story Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json
deleted file mode 100644
index d23d62b87eebcf899b1a5e9822834ce0ab500ae3..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Generate Ending_idmt",
-  "evaluation": {
-    "accuracy": 0.6929185969556585
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Generate Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json
deleted file mode 100644
index 349fb09ae9669f153cee1b2fafd8a3454cfcee50..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Novel Correct Ending_idmt",
-  "evaluation": {
-    "accuracy": 0.9086697551290536
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Novel Correct Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json
deleted file mode 100644
index 07b42403a63a6e9a0d9cd305f9d626b80f2cb0dd..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Story Continuation and Options_idmt",
-  "evaluation": {
-    "accuracy": 0.9159497021839841
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Story Continuation and Options_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json
deleted file mode 100644
index 622a2b5c90190978f50caa3771721da393dc5e48..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Answer Given options_zhmt",
-  "evaluation": {
-    "accuracy": 0.913964262078094
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json
deleted file mode 100644
index 74d7933a087ca4d3b33932b27692f9a149c889c7..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Choose Story Ending_zhmt",
-  "evaluation": {
-    "accuracy": 0.9238914626075446
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json
deleted file mode 100644
index c7d07f0bc402a8bc64659f7ad0d662a11563f472..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Generate Ending_zhmt",
-  "evaluation": {
-    "accuracy": 0.6843150231634679
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json
deleted file mode 100644
index f2cc73ab58a67c8e296e4d941608f9ee4ddceed1..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Novel Correct Ending_zhmt",
-  "evaluation": {
-    "accuracy": 0.9252150893448048
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json
deleted file mode 100644
index 3cccd12bcf6b155e7860a35ac58b67635ec70895..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Story Continuation and Options_zhmt",
-  "evaluation": {
-    "accuracy": 0.913302448709464
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/Replace_frmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/Replace_frmt/results.json
deleted file mode 100644
index 22618bc9f7b3bfe018197f506330b9ac0ffcf243..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/Replace_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "Replace_frmt",
-  "evaluation": {
-    "accuracy": 0.6626506024096386
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='Replace_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json
deleted file mode 100644
index 081254c848e81d983b60c29b734e4e228f88cff5..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "True or False_frmt",
-  "evaluation": {
-    "accuracy": 0.4578313253012048
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='True or False_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json
deleted file mode 100644
index 4c1ee4a09d6ceaaba5fa1afaa5d2b69d1682d596..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "does underscore refer to_frmt",
-  "evaluation": {
-    "accuracy": 0.5783132530120482
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='does underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json
deleted file mode 100644
index 3cf0b9d6d126b11ffd3d6820319d25b229d8cbdb..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "stand for_frmt",
-  "evaluation": {
-    "accuracy": 0.5421686746987951
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='stand for_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json
deleted file mode 100644
index 5d4c32dc9b18cbb25f767896325a96dbae2516d8..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "underscore refer to_frmt",
-  "evaluation": {
-    "accuracy": 0.6265060240963856
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json
deleted file mode 100644
index e34759d77b70ee482aade2ca041c153ed8b79e63..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "Replace_ptmt",
-  "evaluation": {
-    "accuracy": 0.6273764258555133
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='Replace_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json
deleted file mode 100644
index 69542cc44b7cff9aeb10f59d8455b7853479e3d3..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "True or False_ptmt",
-  "evaluation": {
-    "accuracy": 0.532319391634981
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='True or False_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json
deleted file mode 100644
index 44700ff9b05ead1e066bcce7df9fa17041b6f8f2..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "does underscore refer to_ptmt",
-  "evaluation": {
-    "accuracy": 0.596958174904943
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='does underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json
deleted file mode 100644
index 5fdda9ecd6b979210133cd38a86a29e9aa3bab1f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "stand for_ptmt",
-  "evaluation": {
-    "accuracy": 0.5399239543726235
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='stand for_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json
deleted file mode 100644
index 6e2aaeea1b9fb349a7ff9e70ec10f10dca34a10c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "underscore refer to_ptmt",
-  "evaluation": {
-    "accuracy": 0.623574144486692
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json
deleted file mode 100644
index 7e97551910c6ec94f2103fe0bb0d23e98453bb09..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "Replace_zhmt",
-  "evaluation": {
-    "accuracy": 0.7202380952380952
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json
deleted file mode 100644
index 3a5fe840ade7d5406e9be3869192447224fe6c64..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "True or False_zhmt",
-  "evaluation": {
-    "accuracy": 0.5099206349206349
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json
deleted file mode 100644
index 05a41bc50f8a282e2b75434bb1662975be85e479..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "does underscore refer to_zhmt",
-  "evaluation": {
-    "accuracy": 0.6746031746031746
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='does underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json
deleted file mode 100644
index 2e81e46585f08d06cfa91467de512a7b59dced33..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "stand for_zhmt",
-  "evaluation": {
-    "accuracy": 0.5654761904761905
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json
deleted file mode 100644
index d26b81ae8659f7cc0d23004b1dd7982239f52469..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "underscore refer to_zhmt",
-  "evaluation": {
-    "accuracy": 0.7638888888888888
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/merged.csv b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/merged.csv
deleted file mode 100644
index 52bd41e5c505fd7c47e2c1749f8939257d057fec..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/merged.csv
+++ /dev/null
@@ -1,86 +0,0 @@
-dataset,prompt,metric,value
-xcopa_id,C1 or C2? premise_idmt,accuracy,0.57
-xcopa_id,best_option_idmt,accuracy,0.78
-xcopa_id,cause_effect_idmt,accuracy,0.84
-xcopa_id,i_am_hesitating_idmt,accuracy,0.84
-xcopa_id,plausible_alternatives_idmt,accuracy,0.83
-xcopa_id,median,accuracy,0.83
-xcopa_sw,C1 or C2? premise_swmt,accuracy,0.6
-xcopa_sw,best_option_swmt,accuracy,0.59
-xcopa_sw,cause_effect_swmt,accuracy,0.63
-xcopa_sw,i_am_hesitating_swmt,accuracy,0.67
-xcopa_sw,plausible_alternatives_swmt,accuracy,0.62
-xcopa_sw,median,accuracy,0.62
-xcopa_ta,C1 or C2? premise_tamt,accuracy,0.64
-xcopa_ta,best_option_tamt,accuracy,0.56
-xcopa_ta,cause_effect_tamt,accuracy,0.62
-xcopa_ta,i_am_hesitating_tamt,accuracy,0.64
-xcopa_ta,plausible_alternatives_tamt,accuracy,0.63
-xcopa_ta,median,accuracy,0.63
-xcopa_vi,C1 or C2? premise_vimt,accuracy,0.61
-xcopa_vi,best_option_vimt,accuracy,0.77
-xcopa_vi,cause_effect_vimt,accuracy,0.89
-xcopa_vi,i_am_hesitating_vimt,accuracy,0.85
-xcopa_vi,plausible_alternatives_vimt,accuracy,0.87
-xcopa_vi,median,accuracy,0.85
-xcopa_zh,C1 or C2? premise_zhmt,accuracy,0.63
-xcopa_zh,best_option_zhmt,accuracy,0.75
-xcopa_zh,cause_effect_zhmt,accuracy,0.83
-xcopa_zh,i_am_hesitating_zhmt,accuracy,0.84
-xcopa_zh,plausible_alternatives_zhmt,accuracy,0.86
-xcopa_zh,median,accuracy,0.83
-xstory_cloze_ar,Answer Given options_armt,accuracy,0.8941098610191925
-xstory_cloze_ar,Choose Story Ending_armt,accuracy,0.9404367968232958
-xstory_cloze_ar,Generate Ending_armt,accuracy,0.6598279285241562
-xstory_cloze_ar,Novel Correct Ending_armt,accuracy,0.9272005294506949
-xstory_cloze_ar,Story Continuation and Options_armt,accuracy,0.9172733289212442
-xstory_cloze_ar,median,accuracy,0.9172733289212442
-xstory_cloze_es,Answer Given options_esmt,accuracy,0.9311714096624751
-xstory_cloze_es,Choose Story Ending_esmt,accuracy,0.9549966909331569
-xstory_cloze_es,Generate Ending_esmt,accuracy,0.7405691594970218
-xstory_cloze_es,Novel Correct Ending_esmt,accuracy,0.9490403706154864
-xstory_cloze_es,Story Continuation and Options_esmt,accuracy,0.9523494374586366
-xstory_cloze_es,median,accuracy,0.9490403706154864
-xstory_cloze_eu,Answer Given options_eumt,accuracy,0.7326273990734613
-xstory_cloze_eu,Choose Story Ending_eumt,accuracy,0.8682991396426207
-xstory_cloze_eu,Generate Ending_eumt,accuracy,0.6293845135671741
-xstory_cloze_eu,Novel Correct Ending_eumt,accuracy,0.8305757776307081
-xstory_cloze_eu,Story Continuation and Options_eumt,accuracy,0.8259430840502978
-xstory_cloze_eu,median,accuracy,0.8259430840502978
-xstory_cloze_hi,Answer Given options_himt,accuracy,0.8530774321641297
-xstory_cloze_hi,Choose Story Ending_himt,accuracy,0.8914626075446724
-xstory_cloze_hi,Generate Ending_himt,accuracy,0.6644606221045665
-xstory_cloze_hi,Novel Correct Ending_himt,accuracy,0.8821972203838517
-xstory_cloze_hi,Story Continuation and Options_himt,accuracy,0.8735936465916612
-xstory_cloze_hi,median,accuracy,0.8735936465916612
-xstory_cloze_id,Answer Given options_idmt,accuracy,0.8682991396426207
-xstory_cloze_id,Choose Story Ending_idmt,accuracy,0.927862342819325
-xstory_cloze_id,Generate Ending_idmt,accuracy,0.6929185969556585
-xstory_cloze_id,Novel Correct Ending_idmt,accuracy,0.9086697551290536
-xstory_cloze_id,Story Continuation and Options_idmt,accuracy,0.9159497021839841
-xstory_cloze_id,median,accuracy,0.9086697551290536
-xstory_cloze_zh,Answer Given options_zhmt,accuracy,0.913964262078094
-xstory_cloze_zh,Choose Story Ending_zhmt,accuracy,0.9238914626075446
-xstory_cloze_zh,Generate Ending_zhmt,accuracy,0.6843150231634679
-xstory_cloze_zh,Novel Correct Ending_zhmt,accuracy,0.9252150893448048
-xstory_cloze_zh,Story Continuation and Options_zhmt,accuracy,0.913302448709464
-xstory_cloze_zh,median,accuracy,0.913964262078094
-xwinograd_fr,Replace_frmt,accuracy,0.6626506024096386
-xwinograd_fr,True or False_frmt,accuracy,0.4578313253012048
-xwinograd_fr,does underscore refer to_frmt,accuracy,0.5783132530120482
-xwinograd_fr,stand for_frmt,accuracy,0.5421686746987951
-xwinograd_fr,underscore refer to_frmt,accuracy,0.6265060240963856
-xwinograd_fr,median,accuracy,0.5783132530120482
-xwinograd_pt,Replace_ptmt,accuracy,0.6273764258555133
-xwinograd_pt,True or False_ptmt,accuracy,0.532319391634981
-xwinograd_pt,does underscore refer to_ptmt,accuracy,0.596958174904943
-xwinograd_pt,stand for_ptmt,accuracy,0.5399239543726235
-xwinograd_pt,underscore refer to_ptmt,accuracy,0.623574144486692
-xwinograd_pt,median,accuracy,0.596958174904943
-xwinograd_zh,Replace_zhmt,accuracy,0.7202380952380952
-xwinograd_zh,True or False_zhmt,accuracy,0.5099206349206349
-xwinograd_zh,does underscore refer to_zhmt,accuracy,0.6746031746031746
-xwinograd_zh,stand for_zhmt,accuracy,0.5654761904761905
-xwinograd_zh,underscore refer to_zhmt,accuracy,0.7638888888888888
-xwinograd_zh,median,accuracy,0.6746031746031746
-multiple,average,multiple,0.7855970749932859
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/merged.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/merged.json
deleted file mode 100644
index 6ba79edf768cf12f26f0a9792c6b293b2ffeffc7..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/merged.json
+++ /dev/null
@@ -1 +0,0 @@
-{"Muennighoff/xstory_cloze_ar": {"Answer Given options_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Answer Given options_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8941098610191925}, "template_name": "Answer Given options_armt"}, "Choose Story Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Choose Story Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9404367968232958}, "template_name": "Choose Story Ending_armt"}, "Generate Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Generate Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6598279285241562}, "template_name": "Generate Ending_armt"}, "Novel Correct Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Novel Correct Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9272005294506949}, "template_name": "Novel Correct Ending_armt"}, "Story Continuation and Options_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Story Continuation and Options_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9172733289212442}, "template_name": "Story Continuation and Options_armt"}}, "Muennighoff/xstory_cloze_es": {"Answer Given options_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Answer Given options_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9311714096624751}, "template_name": "Answer Given options_esmt"}, "Choose Story Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Choose Story Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9549966909331569}, "template_name": "Choose Story Ending_esmt"}, "Generate Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Generate Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7405691594970218}, "template_name": "Generate Ending_esmt"}, "Novel Correct Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Novel Correct Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9490403706154864}, "template_name": "Novel Correct Ending_esmt"}, "Story Continuation and Options_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Story Continuation and Options_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9523494374586366}, "template_name": "Story Continuation and Options_esmt"}}, "Muennighoff/xstory_cloze_eu": {"Answer Given options_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Answer Given options_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7326273990734613}, "template_name": "Answer Given options_eumt"}, "Choose Story Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Choose Story Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8682991396426207}, "template_name": "Choose Story Ending_eumt"}, "Generate Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Generate Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6293845135671741}, "template_name": "Generate Ending_eumt"}, "Novel Correct Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Novel Correct Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8305757776307081}, "template_name": "Novel Correct Ending_eumt"}, "Story Continuation and Options_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Story Continuation and Options_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8259430840502978}, "template_name": "Story Continuation and Options_eumt"}}, "Muennighoff/xstory_cloze_hi": {"Answer Given options_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Answer Given options_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8530774321641297}, "template_name": "Answer Given options_himt"}, "Choose Story Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Choose Story Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8914626075446724}, "template_name": "Choose Story Ending_himt"}, "Generate Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Generate Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6644606221045665}, "template_name": "Generate Ending_himt"}, "Novel Correct Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Novel Correct Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8821972203838517}, "template_name": "Novel Correct Ending_himt"}, "Story Continuation and Options_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Story Continuation and Options_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8735936465916612}, "template_name": "Story Continuation and Options_himt"}}, "Muennighoff/xstory_cloze_id": {"Answer Given options_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Answer Given options_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8682991396426207}, "template_name": "Answer Given options_idmt"}, "Choose Story Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Choose Story Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.927862342819325}, "template_name": "Choose Story Ending_idmt"}, "Generate Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Generate Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6929185969556585}, "template_name": "Generate Ending_idmt"}, "Novel Correct Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Novel Correct Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9086697551290536}, "template_name": "Novel Correct Ending_idmt"}, "Story Continuation and Options_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Story Continuation and Options_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9159497021839841}, "template_name": "Story Continuation and Options_idmt"}}, "Muennighoff/xstory_cloze_zh": {"Answer Given options_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.913964262078094}, "template_name": "Answer Given options_zhmt"}, "Choose Story Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9238914626075446}, "template_name": "Choose Story Ending_zhmt"}, "Generate Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6843150231634679}, "template_name": "Generate Ending_zhmt"}, "Novel Correct Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9252150893448048}, "template_name": "Novel Correct Ending_zhmt"}, "Story Continuation and Options_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.913302448709464}, "template_name": "Story Continuation and Options_zhmt"}}, "Muennighoff/xwinograd_fr": {"Replace_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='Replace_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6626506024096386}, "template_name": "Replace_frmt"}, "True or False_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='True or False_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.4578313253012048}, "template_name": "True or False_frmt"}, "does underscore refer to_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='does underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5783132530120482}, "template_name": "does underscore refer to_frmt"}, "stand for_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='stand for_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5421686746987951}, "template_name": "stand for_frmt"}, "underscore refer to_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6265060240963856}, "template_name": "underscore refer to_frmt"}}, "Muennighoff/xwinograd_pt": {"Replace_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='Replace_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6273764258555133}, "template_name": "Replace_ptmt"}, "True or False_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='True or False_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.532319391634981}, "template_name": "True or False_ptmt"}, "does underscore refer to_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='does underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.596958174904943}, "template_name": "does underscore refer to_ptmt"}, "stand for_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='stand for_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5399239543726235}, "template_name": "stand for_ptmt"}, "underscore refer to_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.623574144486692}, "template_name": "underscore refer to_ptmt"}}, "Muennighoff/xwinograd_zh": {"Replace_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.7202380952380952}, "template_name": "Replace_zhmt"}, "True or False_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5099206349206349}, "template_name": "True or False_zhmt"}, "does underscore refer to_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='does underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6746031746031746}, "template_name": "does underscore refer to_zhmt"}, "stand for_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5654761904761905}, "template_name": "stand for_zhmt"}, "underscore refer to_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.7638888888888888}, "template_name": "underscore refer to_zhmt"}}, "xcopa_id": {"C1 or C2? premise_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='C1 or C2? premise_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.57}, "template_name": "C1 or C2? premise_idmt"}, "best_option_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='best_option_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.78}, "template_name": "best_option_idmt"}, "cause_effect_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='cause_effect_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.84}, "template_name": "cause_effect_idmt"}, "i_am_hesitating_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='i_am_hesitating_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.84}, "template_name": "i_am_hesitating_idmt"}, "plausible_alternatives_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='plausible_alternatives_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.83}, "template_name": "plausible_alternatives_idmt"}}, "xcopa_sw": {"C1 or C2? premise_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='C1 or C2? premise_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.6}, "template_name": "C1 or C2? premise_swmt"}, "best_option_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='best_option_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.59}, "template_name": "best_option_swmt"}, "cause_effect_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='cause_effect_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.63}, "template_name": "cause_effect_swmt"}, "i_am_hesitating_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='i_am_hesitating_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.67}, "template_name": "i_am_hesitating_swmt"}, "plausible_alternatives_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='plausible_alternatives_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "plausible_alternatives_swmt"}}, "xcopa_ta": {"C1 or C2? premise_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='C1 or C2? premise_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "C1 or C2? premise_tamt"}, "best_option_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='best_option_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.56}, "template_name": "best_option_tamt"}, "cause_effect_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='cause_effect_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "cause_effect_tamt"}, "i_am_hesitating_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='i_am_hesitating_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "i_am_hesitating_tamt"}, "plausible_alternatives_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='plausible_alternatives_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.63}, "template_name": "plausible_alternatives_tamt"}}, "xcopa_vi": {"C1 or C2? premise_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='C1 or C2? premise_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.61}, "template_name": "C1 or C2? premise_vimt"}, "best_option_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='best_option_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.77}, "template_name": "best_option_vimt"}, "cause_effect_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='cause_effect_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.89}, "template_name": "cause_effect_vimt"}, "i_am_hesitating_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='i_am_hesitating_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.85}, "template_name": "i_am_hesitating_vimt"}, "plausible_alternatives_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='plausible_alternatives_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.87}, "template_name": "plausible_alternatives_vimt"}}, "xcopa_zh": {"C1 or C2? premise_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.63}, "template_name": "C1 or C2? premise_zhmt"}, "best_option_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.75}, "template_name": "best_option_zhmt"}, "cause_effect_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='cause_effect_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.83}, "template_name": "cause_effect_zhmt"}, "i_am_hesitating_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.84}, "template_name": "i_am_hesitating_zhmt"}, "plausible_alternatives_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.86}, "template_name": "plausible_alternatives_zhmt"}}}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/C1_or_C2?_premise_idmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/C1_or_C2?_premise_idmt/results.json
deleted file mode 100644
index a3b159b6975c5a2a3ba1704d4a114cff18e67241..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/C1_or_C2?_premise_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "C1 or C2? premise_idmt",
-  "evaluation": {
-    "accuracy": 0.57
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='C1 or C2? premise_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/best_option_idmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/best_option_idmt/results.json
deleted file mode 100644
index 855f53dfb43806d722065ba2cfa8ca448756a2cb..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/best_option_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "best_option_idmt",
-  "evaluation": {
-    "accuracy": 0.78
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='best_option_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/cause_effect_idmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/cause_effect_idmt/results.json
deleted file mode 100644
index c68da0c2acb059fa1add54ef45470d452c3e9c07..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/cause_effect_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "cause_effect_idmt",
-  "evaluation": {
-    "accuracy": 0.84
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='cause_effect_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/i_am_hesitating_idmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/i_am_hesitating_idmt/results.json
deleted file mode 100644
index bc354e5e99e62eebea0b523de3823573281b5791..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/i_am_hesitating_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "i_am_hesitating_idmt",
-  "evaluation": {
-    "accuracy": 0.84
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='i_am_hesitating_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/plausible_alternatives_idmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/plausible_alternatives_idmt/results.json
deleted file mode 100644
index 56547788a32feb5d2b997d3590f637dae6eb98dc..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/id/plausible_alternatives_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "plausible_alternatives_idmt",
-  "evaluation": {
-    "accuracy": 0.83
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='plausible_alternatives_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/C1_or_C2?_premise_swmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/C1_or_C2?_premise_swmt/results.json
deleted file mode 100644
index ece2dd50d4d67e426e5e519747b1841c335c8c2c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/C1_or_C2?_premise_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "C1 or C2? premise_swmt",
-  "evaluation": {
-    "accuracy": 0.6
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='C1 or C2? premise_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/best_option_swmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/best_option_swmt/results.json
deleted file mode 100644
index 1dd7fd8221b9ed98ae38d201ec20eb69fd81e899..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/best_option_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "best_option_swmt",
-  "evaluation": {
-    "accuracy": 0.59
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='best_option_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/cause_effect_swmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/cause_effect_swmt/results.json
deleted file mode 100644
index 8eb58372c78dd53c849b27c08b98c3e9c6d8eaad..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/cause_effect_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "cause_effect_swmt",
-  "evaluation": {
-    "accuracy": 0.63
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='cause_effect_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/i_am_hesitating_swmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/i_am_hesitating_swmt/results.json
deleted file mode 100644
index e803205adcfefba9f30d43af3290e2004e39e3aa..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/i_am_hesitating_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "i_am_hesitating_swmt",
-  "evaluation": {
-    "accuracy": 0.67
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='i_am_hesitating_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/plausible_alternatives_swmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/plausible_alternatives_swmt/results.json
deleted file mode 100644
index 51f2499b03c23ca354dfe8be026a5d87885a7929..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/sw/plausible_alternatives_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "plausible_alternatives_swmt",
-  "evaluation": {
-    "accuracy": 0.62
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='plausible_alternatives_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/C1_or_C2?_premise_tamt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/C1_or_C2?_premise_tamt/results.json
deleted file mode 100644
index f3ef16661ff501e41d7fb1a38c1aa3bb5289d109..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/C1_or_C2?_premise_tamt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "C1 or C2? premise_tamt",
-  "evaluation": {
-    "accuracy": 0.64
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='C1 or C2? premise_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/best_option_tamt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/best_option_tamt/results.json
deleted file mode 100644
index 947f7537af74f40d992e21e57b4e0b3031461954..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/best_option_tamt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "best_option_tamt",
-  "evaluation": {
-    "accuracy": 0.56
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='best_option_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/cause_effect_tamt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/cause_effect_tamt/results.json
deleted file mode 100644
index 8b22c4ea50f2cbf069ce00e68da7ce949e4b60a3..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/cause_effect_tamt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "cause_effect_tamt",
-  "evaluation": {
-    "accuracy": 0.62
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='cause_effect_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/i_am_hesitating_tamt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/i_am_hesitating_tamt/results.json
deleted file mode 100644
index a246afec199aa339abb27753067bd74ff6298781..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/i_am_hesitating_tamt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "i_am_hesitating_tamt",
-  "evaluation": {
-    "accuracy": 0.64
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='i_am_hesitating_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/plausible_alternatives_tamt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/plausible_alternatives_tamt/results.json
deleted file mode 100644
index 010f33175cc3862712980ab18b2b8cacda387d00..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/ta/plausible_alternatives_tamt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "plausible_alternatives_tamt",
-  "evaluation": {
-    "accuracy": 0.63
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='plausible_alternatives_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/C1_or_C2?_premise_vimt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/C1_or_C2?_premise_vimt/results.json
deleted file mode 100644
index a11e4478e8d0d68f2b1da5f504c2412c75fdba56..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/C1_or_C2?_premise_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "C1 or C2? premise_vimt",
-  "evaluation": {
-    "accuracy": 0.61
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='C1 or C2? premise_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/best_option_vimt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/best_option_vimt/results.json
deleted file mode 100644
index a4969d11187ca6c988435275f85d713f14da8ba7..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/best_option_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "best_option_vimt",
-  "evaluation": {
-    "accuracy": 0.77
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='best_option_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/cause_effect_vimt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/cause_effect_vimt/results.json
deleted file mode 100644
index 3f3e2a8ef414dda19185126456aba80cc3c9dd64..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/cause_effect_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "cause_effect_vimt",
-  "evaluation": {
-    "accuracy": 0.89
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='cause_effect_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/i_am_hesitating_vimt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/i_am_hesitating_vimt/results.json
deleted file mode 100644
index c4786bd77a9ec21e35c1b62f2ec5a5a548975afe..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/i_am_hesitating_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "i_am_hesitating_vimt",
-  "evaluation": {
-    "accuracy": 0.85
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='i_am_hesitating_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/plausible_alternatives_vimt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/plausible_alternatives_vimt/results.json
deleted file mode 100644
index 45f092dd4589031c1a20671997c0f94b75e80371..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/vi/plausible_alternatives_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "plausible_alternatives_vimt",
-  "evaluation": {
-    "accuracy": 0.87
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='plausible_alternatives_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json
deleted file mode 100644
index 6fa79011c4d9ffa2bebb87d9c24606bc43b1d35d..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "C1 or C2? premise_zhmt",
-  "evaluation": {
-    "accuracy": 0.63
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/best_option_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/best_option_zhmt/results.json
deleted file mode 100644
index 4b69455120fb83b58f3f2d20dc0a6de9499b8c25..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/best_option_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "best_option_zhmt",
-  "evaluation": {
-    "accuracy": 0.75
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/cause_effect_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/cause_effect_zhmt/results.json
deleted file mode 100644
index 0d45d6dec2e02a88c4efae655983cf388b732b83..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/cause_effect_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "cause_effect_zhmt",
-  "evaluation": {
-    "accuracy": 0.83
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='cause_effect_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/i_am_hesitating_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/i_am_hesitating_zhmt/results.json
deleted file mode 100644
index 7f0f51c3e84c028612690851eafd25fa8693114c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/i_am_hesitating_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "i_am_hesitating_zhmt",
-  "evaluation": {
-    "accuracy": 0.84
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/plausible_alternatives_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/plausible_alternatives_zhmt/results.json
deleted file mode 100644
index 6e0ebbeb1c92528c677e0c3041a5520d00eb246a..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xcopawinostorymt/xcopa/zh/plausible_alternatives_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "plausible_alternatives_zhmt",
-  "evaluation": {
-    "accuracy": 0.86
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/GPT-3_style_arht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/GPT-3_style_arht/results.json
deleted file mode 100644
index 1151e88244939e66270073875e6352cbcb194777..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/GPT-3_style_arht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "GPT-3 style_arht",
-  "evaluation": {
-    "accuracy": 0.40441767068273093
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_arht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/MNLI_crowdsource_arht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/MNLI_crowdsource_arht/results.json
deleted file mode 100644
index 3e50548b0ae0c7bda585ddb4ea99660d84f1f86a..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/MNLI_crowdsource_arht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "MNLI crowdsource_arht",
-  "evaluation": {
-    "accuracy": 0.43012048192771085
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_arht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/can_we_infer_arht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/can_we_infer_arht/results.json
deleted file mode 100644
index 0d4880853137e81090cda44fc001e9425693628e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/can_we_infer_arht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "can we infer_arht",
-  "evaluation": {
-    "accuracy": 0.3610441767068273
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_arht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/guaranteed_possible_impossible_arht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/guaranteed_possible_impossible_arht/results.json
deleted file mode 100644
index 43aade15d507b7981f09fef192ecbd4de817e416..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/guaranteed_possible_impossible_arht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "guaranteed/possible/impossible_arht",
-  "evaluation": {
-    "accuracy": 0.3642570281124498
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_arht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/justified_in_saying_arht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/justified_in_saying_arht/results.json
deleted file mode 100644
index d7673b1d800b7e181915339d05873c3f4f9046b7..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ar/justified_in_saying_arht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "justified in saying_arht",
-  "evaluation": {
-    "accuracy": 0.37309236947791163
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_arht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/GPT-3_style_esht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/GPT-3_style_esht/results.json
deleted file mode 100644
index c0ab7199857150c8bdbf32568948c735ac767b6d..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/GPT-3_style_esht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "GPT-3 style_esht",
-  "evaluation": {
-    "accuracy": 0.5698795180722892
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/MNLI_crowdsource_esht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/MNLI_crowdsource_esht/results.json
deleted file mode 100644
index c71d79218ce58a8e449580613f75a4c2e19e1e8e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/MNLI_crowdsource_esht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "MNLI crowdsource_esht",
-  "evaluation": {
-    "accuracy": 0.342570281124498
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/can_we_infer_esht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/can_we_infer_esht/results.json
deleted file mode 100644
index 9e42be28ecd114e6f6f4d88689f566e7a5aab611..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/can_we_infer_esht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "can we infer_esht",
-  "evaluation": {
-    "accuracy": 0.46546184738955826
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/guaranteed_possible_impossible_esht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/guaranteed_possible_impossible_esht/results.json
deleted file mode 100644
index 0c99dceb99ba9331895c687b2c4e5f3a7f92578e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/guaranteed_possible_impossible_esht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "guaranteed/possible/impossible_esht",
-  "evaluation": {
-    "accuracy": 0.5526104417670683
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/justified_in_saying_esht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/justified_in_saying_esht/results.json
deleted file mode 100644
index fb77a7a139dd8b9c24ea015e0d02823933a7e16a..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/es/justified_in_saying_esht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "justified in saying_esht",
-  "evaluation": {
-    "accuracy": 0.4321285140562249
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/GPT-3_style_frht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/GPT-3_style_frht/results.json
deleted file mode 100644
index 4b13c026436426907cc26ba42d716088ce4b59b4..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/GPT-3_style_frht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "GPT-3 style_frht",
-  "evaluation": {
-    "accuracy": 0.4995983935742972
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/MNLI_crowdsource_frht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/MNLI_crowdsource_frht/results.json
deleted file mode 100644
index 7d7e02d1a9f66af72a883aa01d74e2dde9197e98..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/MNLI_crowdsource_frht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "MNLI crowdsource_frht",
-  "evaluation": {
-    "accuracy": 0.4004016064257028
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/can_we_infer_frht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/can_we_infer_frht/results.json
deleted file mode 100644
index a407e7ae562371e6260c90b1ea0a3c9107aa37f7..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/can_we_infer_frht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "can we infer_frht",
-  "evaluation": {
-    "accuracy": 0.5694779116465863
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/guaranteed_possible_impossible_frht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/guaranteed_possible_impossible_frht/results.json
deleted file mode 100644
index aa655ebc807cb20ba71577ff3194250f5616a1f3..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/guaranteed_possible_impossible_frht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "guaranteed/possible/impossible_frht",
-  "evaluation": {
-    "accuracy": 0.5152610441767068
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/justified_in_saying_frht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/justified_in_saying_frht/results.json
deleted file mode 100644
index 014ade2a465f303693ffe5cab94fbdd643774842..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/fr/justified_in_saying_frht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "justified in saying_frht",
-  "evaluation": {
-    "accuracy": 0.5493975903614458
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/GPT-3_style_hiht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/GPT-3_style_hiht/results.json
deleted file mode 100644
index a4e92a96761a9a536d4bb8a5e4ba4f4061b7f539..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/GPT-3_style_hiht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "GPT-3 style_hiht",
-  "evaluation": {
-    "accuracy": 0.44417670682730925
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/MNLI_crowdsource_hiht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/MNLI_crowdsource_hiht/results.json
deleted file mode 100644
index 8efef72cd5d3fc77ef9b4366770aaae329a5fb7a..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/MNLI_crowdsource_hiht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "MNLI crowdsource_hiht",
-  "evaluation": {
-    "accuracy": 0.5236947791164659
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/can_we_infer_hiht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/can_we_infer_hiht/results.json
deleted file mode 100644
index 778e69f3893a372b47a6fbdc28161f775335d96c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/can_we_infer_hiht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "can we infer_hiht",
-  "evaluation": {
-    "accuracy": 0.4963855421686747
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/guaranteed_possible_impossible_hiht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/guaranteed_possible_impossible_hiht/results.json
deleted file mode 100644
index 1ebc1461a52c3cde14887dca4097c766c74b2fbb..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/guaranteed_possible_impossible_hiht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "guaranteed/possible/impossible_hiht",
-  "evaluation": {
-    "accuracy": 0.4493975903614458
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/justified_in_saying_hiht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/justified_in_saying_hiht/results.json
deleted file mode 100644
index d4e84c9aaefa0f61a8a742fc3d8391391085d071..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/hi/justified_in_saying_hiht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "justified in saying_hiht",
-  "evaluation": {
-    "accuracy": 0.4963855421686747
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/merged.csv b/evaluation_bloomz-mt/evaluation_xnliht/xnli/merged.csv
deleted file mode 100644
index 38974d265a2b337eb24fd64b0a27d6af10f7d5c6..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/merged.csv
+++ /dev/null
@@ -1,50 +0,0 @@
-dataset,prompt,metric,value
-xnli_ar,GPT-3 style_arht,accuracy,0.40441767068273093
-xnli_ar,MNLI crowdsource_arht,accuracy,0.43012048192771085
-xnli_ar,can we infer_arht,accuracy,0.3610441767068273
-xnli_ar,guaranteed/possible/impossible_arht,accuracy,0.3642570281124498
-xnli_ar,justified in saying_arht,accuracy,0.37309236947791163
-xnli_ar,median,accuracy,0.37309236947791163
-xnli_es,GPT-3 style_esht,accuracy,0.5698795180722892
-xnli_es,MNLI crowdsource_esht,accuracy,0.342570281124498
-xnli_es,can we infer_esht,accuracy,0.46546184738955826
-xnli_es,guaranteed/possible/impossible_esht,accuracy,0.5526104417670683
-xnli_es,justified in saying_esht,accuracy,0.4321285140562249
-xnli_es,median,accuracy,0.46546184738955826
-xnli_fr,GPT-3 style_frht,accuracy,0.4995983935742972
-xnli_fr,MNLI crowdsource_frht,accuracy,0.4004016064257028
-xnli_fr,can we infer_frht,accuracy,0.5694779116465863
-xnli_fr,guaranteed/possible/impossible_frht,accuracy,0.5152610441767068
-xnli_fr,justified in saying_frht,accuracy,0.5493975903614458
-xnli_fr,median,accuracy,0.5152610441767068
-xnli_hi,GPT-3 style_hiht,accuracy,0.44417670682730925
-xnli_hi,MNLI crowdsource_hiht,accuracy,0.5236947791164659
-xnli_hi,can we infer_hiht,accuracy,0.4963855421686747
-xnli_hi,guaranteed/possible/impossible_hiht,accuracy,0.4493975903614458
-xnli_hi,justified in saying_hiht,accuracy,0.4963855421686747
-xnli_hi,median,accuracy,0.4963855421686747
-xnli_sw,GPT-3 style_swht,accuracy,0.39397590361445783
-xnli_sw,MNLI crowdsource_swht,accuracy,0.3329317269076305
-xnli_sw,can we infer_swht,accuracy,0.4285140562248996
-xnli_sw,guaranteed/possible/impossible_swht,accuracy,0.38433734939759034
-xnli_sw,justified in saying_swht,accuracy,0.41967871485943775
-xnli_sw,median,accuracy,0.39397590361445783
-xnli_ur,GPT-3 style_urht,accuracy,0.463855421686747
-xnli_ur,MNLI crowdsource_urht,accuracy,0.40441767068273093
-xnli_ur,can we infer_urht,accuracy,0.3895582329317269
-xnli_ur,guaranteed/possible/impossible_urht,accuracy,0.3405622489959839
-xnli_ur,justified in saying_urht,accuracy,0.43293172690763054
-xnli_ur,median,accuracy,0.40441767068273093
-xnli_vi,GPT-3 style_viht,accuracy,0.5261044176706827
-xnli_vi,MNLI crowdsource_viht,accuracy,0.39879518072289155
-xnli_vi,can we infer_viht,accuracy,0.5481927710843374
-xnli_vi,guaranteed/possible/impossible_viht,accuracy,0.43694779116465865
-xnli_vi,justified in saying_viht,accuracy,0.46546184738955826
-xnli_vi,median,accuracy,0.46546184738955826
-xnli_zh,GPT-3 style_zhht,accuracy,0.36947791164658633
-xnli_zh,MNLI crowdsource_zhht,accuracy,0.3457831325301205
-xnli_zh,can we infer_zhht,accuracy,0.3441767068273092
-xnli_zh,guaranteed/possible/impossible_zhht,accuracy,0.4923694779116466
-xnli_zh,justified in saying_zhht,accuracy,0.3927710843373494
-xnli_zh,median,accuracy,0.36947791164658633
-multiple,average,multiple,0.4354417670682731
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/merged.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/merged.json
deleted file mode 100644
index 2d843c2753dd0cbff8718e6589bb67fa198770c6..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/merged.json
+++ /dev/null
@@ -1 +0,0 @@
-{"xnli_ar": {"GPT-3 style_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40441767068273093}, "template_name": "GPT-3 style_arht"}, "MNLI crowdsource_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43012048192771085}, "template_name": "MNLI crowdsource_arht"}, "can we infer_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3610441767068273}, "template_name": "can we infer_arht"}, "guaranteed/possible/impossible_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3642570281124498}, "template_name": "guaranteed/possible/impossible_arht"}, "justified in saying_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.37309236947791163}, "template_name": "justified in saying_arht"}}, "xnli_es": {"GPT-3 style_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5698795180722892}, "template_name": "GPT-3 style_esht"}, "MNLI crowdsource_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.342570281124498}, "template_name": "MNLI crowdsource_esht"}, "can we infer_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.46546184738955826}, "template_name": "can we infer_esht"}, "guaranteed/possible/impossible_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5526104417670683}, "template_name": "guaranteed/possible/impossible_esht"}, "justified in saying_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4321285140562249}, "template_name": "justified in saying_esht"}}, "xnli_fr": {"GPT-3 style_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4995983935742972}, "template_name": "GPT-3 style_frht"}, "MNLI crowdsource_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4004016064257028}, "template_name": "MNLI crowdsource_frht"}, "can we infer_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5694779116465863}, "template_name": "can we infer_frht"}, "guaranteed/possible/impossible_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5152610441767068}, "template_name": "guaranteed/possible/impossible_frht"}, "justified in saying_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5493975903614458}, "template_name": "justified in saying_frht"}}, "xnli_hi": {"GPT-3 style_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44417670682730925}, "template_name": "GPT-3 style_hiht"}, "MNLI crowdsource_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5236947791164659}, "template_name": "MNLI crowdsource_hiht"}, "can we infer_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4963855421686747}, "template_name": "can we infer_hiht"}, "guaranteed/possible/impossible_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4493975903614458}, "template_name": "guaranteed/possible/impossible_hiht"}, "justified in saying_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4963855421686747}, "template_name": "justified in saying_hiht"}}, "xnli_sw": {"GPT-3 style_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.39397590361445783}, "template_name": "GPT-3 style_swht"}, "MNLI crowdsource_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3329317269076305}, "template_name": "MNLI crowdsource_swht"}, "can we infer_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4285140562248996}, "template_name": "can we infer_swht"}, "guaranteed/possible/impossible_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.38433734939759034}, "template_name": "guaranteed/possible/impossible_swht"}, "justified in saying_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41967871485943775}, "template_name": "justified in saying_swht"}}, "xnli_ur": {"GPT-3 style_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.463855421686747}, "template_name": "GPT-3 style_urht"}, "MNLI crowdsource_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40441767068273093}, "template_name": "MNLI crowdsource_urht"}, "can we infer_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3895582329317269}, "template_name": "can we infer_urht"}, "guaranteed/possible/impossible_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3405622489959839}, "template_name": "guaranteed/possible/impossible_urht"}, "justified in saying_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43293172690763054}, "template_name": "justified in saying_urht"}}, "xnli_vi": {"GPT-3 style_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5261044176706827}, "template_name": "GPT-3 style_viht"}, "MNLI crowdsource_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.39879518072289155}, "template_name": "MNLI crowdsource_viht"}, "can we infer_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5481927710843374}, "template_name": "can we infer_viht"}, "guaranteed/possible/impossible_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43694779116465865}, "template_name": "guaranteed/possible/impossible_viht"}, "justified in saying_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.46546184738955826}, "template_name": "justified in saying_viht"}}, "xnli_zh": {"GPT-3 style_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.36947791164658633}, "template_name": "GPT-3 style_zhht"}, "MNLI crowdsource_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3457831325301205}, "template_name": "MNLI crowdsource_zhht"}, "can we infer_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3441767068273092}, "template_name": "can we infer_zhht"}, "guaranteed/possible/impossible_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4923694779116466}, "template_name": "guaranteed/possible/impossible_zhht"}, "justified in saying_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3927710843373494}, "template_name": "justified in saying_zhht"}}}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/GPT-3_style_swht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/GPT-3_style_swht/results.json
deleted file mode 100644
index d7b7b923b78fb24fbccb524ad75690014a17cac7..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/GPT-3_style_swht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "GPT-3 style_swht",
-  "evaluation": {
-    "accuracy": 0.39397590361445783
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/MNLI_crowdsource_swht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/MNLI_crowdsource_swht/results.json
deleted file mode 100644
index de9fd9c9a0227ffb16da0b5303649cad3dfccb9f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/MNLI_crowdsource_swht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "MNLI crowdsource_swht",
-  "evaluation": {
-    "accuracy": 0.3329317269076305
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/can_we_infer_swht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/can_we_infer_swht/results.json
deleted file mode 100644
index ac81892cb9c74f6107dafad135338f6d9683507d..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/can_we_infer_swht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "can we infer_swht",
-  "evaluation": {
-    "accuracy": 0.4285140562248996
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/guaranteed_possible_impossible_swht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/guaranteed_possible_impossible_swht/results.json
deleted file mode 100644
index 9ba9a00506c2e6fc4a16f3e21fd815d44dbca4a5..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/guaranteed_possible_impossible_swht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "guaranteed/possible/impossible_swht",
-  "evaluation": {
-    "accuracy": 0.38433734939759034
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/justified_in_saying_swht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/justified_in_saying_swht/results.json
deleted file mode 100644
index 5b3e944be5fd9d9d99fb1b9a48e933a0837aa80f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/sw/justified_in_saying_swht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "justified in saying_swht",
-  "evaluation": {
-    "accuracy": 0.41967871485943775
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/GPT-3_style_urht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/GPT-3_style_urht/results.json
deleted file mode 100644
index 2a8b24bdd46252fb9a65c95158e410ba06d95808..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/GPT-3_style_urht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "GPT-3 style_urht",
-  "evaluation": {
-    "accuracy": 0.463855421686747
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/MNLI_crowdsource_urht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/MNLI_crowdsource_urht/results.json
deleted file mode 100644
index daec90e79a511836f4cb80c164414151fd805e8d..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/MNLI_crowdsource_urht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "MNLI crowdsource_urht",
-  "evaluation": {
-    "accuracy": 0.40441767068273093
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/can_we_infer_urht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/can_we_infer_urht/results.json
deleted file mode 100644
index 0b83201fc43fdb6641cc4783c7008f87aed55bad..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/can_we_infer_urht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "can we infer_urht",
-  "evaluation": {
-    "accuracy": 0.3895582329317269
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/guaranteed_possible_impossible_urht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/guaranteed_possible_impossible_urht/results.json
deleted file mode 100644
index 5c63d51d73fa542f757e07787912f164ca05d995..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/guaranteed_possible_impossible_urht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "guaranteed/possible/impossible_urht",
-  "evaluation": {
-    "accuracy": 0.3405622489959839
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/justified_in_saying_urht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/justified_in_saying_urht/results.json
deleted file mode 100644
index bdd9008231968eb2e6743a6a45934a07906fad79..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/ur/justified_in_saying_urht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "justified in saying_urht",
-  "evaluation": {
-    "accuracy": 0.43293172690763054
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/GPT-3_style_viht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/GPT-3_style_viht/results.json
deleted file mode 100644
index f4e01f9fb887fe7de2f4fe46d01345fd92a6d510..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/GPT-3_style_viht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "GPT-3 style_viht",
-  "evaluation": {
-    "accuracy": 0.5261044176706827
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_viht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/MNLI_crowdsource_viht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/MNLI_crowdsource_viht/results.json
deleted file mode 100644
index 8520acbf68d09bc928c0dfb7b737a41e17594de3..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/MNLI_crowdsource_viht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "MNLI crowdsource_viht",
-  "evaluation": {
-    "accuracy": 0.39879518072289155
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_viht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/can_we_infer_viht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/can_we_infer_viht/results.json
deleted file mode 100644
index 7b0bdaec60efc2784c288f22715d50e346280108..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/can_we_infer_viht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "can we infer_viht",
-  "evaluation": {
-    "accuracy": 0.5481927710843374
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_viht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/guaranteed_possible_impossible_viht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/guaranteed_possible_impossible_viht/results.json
deleted file mode 100644
index 097d91dcde196e9d9496f46fa4581108a8bd4494..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/guaranteed_possible_impossible_viht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "guaranteed/possible/impossible_viht",
-  "evaluation": {
-    "accuracy": 0.43694779116465865
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_viht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/justified_in_saying_viht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/justified_in_saying_viht/results.json
deleted file mode 100644
index 5efd3a1377077b27e6da271aae29d9ef8e3b883c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/vi/justified_in_saying_viht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "justified in saying_viht",
-  "evaluation": {
-    "accuracy": 0.46546184738955826
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_viht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/GPT-3_style_zhht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/GPT-3_style_zhht/results.json
deleted file mode 100644
index 553709670d2c50865f09e85fa119e026995287ea..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/GPT-3_style_zhht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "GPT-3 style_zhht",
-  "evaluation": {
-    "accuracy": 0.36947791164658633
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/MNLI_crowdsource_zhht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/MNLI_crowdsource_zhht/results.json
deleted file mode 100644
index efa8c119c81082c69eba765ada8008d2f34e146e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/MNLI_crowdsource_zhht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "MNLI crowdsource_zhht",
-  "evaluation": {
-    "accuracy": 0.3457831325301205
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/can_we_infer_zhht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/can_we_infer_zhht/results.json
deleted file mode 100644
index f8cbc3083571d601c942d7fc15d93640c6c7eaa8..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/can_we_infer_zhht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "can we infer_zhht",
-  "evaluation": {
-    "accuracy": 0.3441767068273092
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/guaranteed_possible_impossible_zhht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/guaranteed_possible_impossible_zhht/results.json
deleted file mode 100644
index a1e72172e6d797ddefbbd510348113cc78ac67a2..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/guaranteed_possible_impossible_zhht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "guaranteed/possible/impossible_zhht",
-  "evaluation": {
-    "accuracy": 0.4923694779116466
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/justified_in_saying_zhht/results.json b/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/justified_in_saying_zhht/results.json
deleted file mode 100644
index 5a0ba1b45a09e30711abbcfd3ff8c6bf9ae068fd..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnliht/xnli/zh/justified_in_saying_zhht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "justified in saying_zhht",
-  "evaluation": {
-    "accuracy": 0.3927710843373494
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/GPT-3_style_armt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/GPT-3_style_armt/results.json
deleted file mode 100644
index d3df8a608a2418831da7a727daaf768fc29d643a..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/GPT-3_style_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "GPT-3 style_armt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/MNLI_crowdsource_armt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/MNLI_crowdsource_armt/results.json
deleted file mode 100644
index 484fea8074d4096940c1cf3e4cb14838e1ba76f8..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/MNLI_crowdsource_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "MNLI crowdsource_armt",
-  "evaluation": {
-    "accuracy": 0.4542168674698795
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/can_we_infer_armt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/can_we_infer_armt/results.json
deleted file mode 100644
index e9b7ea9ed0effc3aa4313b68b2bdebbd1a48c5a6..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/can_we_infer_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "can we infer_armt",
-  "evaluation": {
-    "accuracy": 0.41967871485943775
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/guaranteed_possible_impossible_armt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/guaranteed_possible_impossible_armt/results.json
deleted file mode 100644
index 2c8ceb85c15b119101189af34b5928207747a25b..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/guaranteed_possible_impossible_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "guaranteed/possible/impossible_armt",
-  "evaluation": {
-    "accuracy": 0.3795180722891566
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/justified_in_saying_armt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/justified_in_saying_armt/results.json
deleted file mode 100644
index f770884c9af2373782a2991969239d8e9ac36957..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ar/justified_in_saying_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "justified in saying_armt",
-  "evaluation": {
-    "accuracy": 0.44016064257028115
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/GPT-3_style_esmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/GPT-3_style_esmt/results.json
deleted file mode 100644
index fb0db17b3f51d113c8094890d8e3387427676096..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/GPT-3_style_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "GPT-3 style_esmt",
-  "evaluation": {
-    "accuracy": 0.5381526104417671
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/MNLI_crowdsource_esmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/MNLI_crowdsource_esmt/results.json
deleted file mode 100644
index c0297e49e925ab677bb63ea7548be8f16ca193e0..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/MNLI_crowdsource_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "MNLI crowdsource_esmt",
-  "evaluation": {
-    "accuracy": 0.4951807228915663
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/can_we_infer_esmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/can_we_infer_esmt/results.json
deleted file mode 100644
index 16f5ae1d0c3637385fc3b3ac9109610bcab41caf..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/can_we_infer_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "can we infer_esmt",
-  "evaluation": {
-    "accuracy": 0.4951807228915663
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/guaranteed_possible_impossible_esmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/guaranteed_possible_impossible_esmt/results.json
deleted file mode 100644
index 7de5aec8f541506b3bd55756b31ee8e84edd71de..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/guaranteed_possible_impossible_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "guaranteed/possible/impossible_esmt",
-  "evaluation": {
-    "accuracy": 0.3349397590361446
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/justified_in_saying_esmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/justified_in_saying_esmt/results.json
deleted file mode 100644
index 026c7c34328b7831028b3709d9a8918c87d9cbb6..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/es/justified_in_saying_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "justified in saying_esmt",
-  "evaluation": {
-    "accuracy": 0.4955823293172691
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/GPT-3_style_frmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/GPT-3_style_frmt/results.json
deleted file mode 100644
index f7274d952c0de2c942814fd124fdbaca283485ff..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/GPT-3_style_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "GPT-3 style_frmt",
-  "evaluation": {
-    "accuracy": 0.4746987951807229
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/MNLI_crowdsource_frmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/MNLI_crowdsource_frmt/results.json
deleted file mode 100644
index 4e2e6908b8e98c832eae2b7c01606c6a7e4fbe3e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/MNLI_crowdsource_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "MNLI crowdsource_frmt",
-  "evaluation": {
-    "accuracy": 0.3538152610441767
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/can_we_infer_frmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/can_we_infer_frmt/results.json
deleted file mode 100644
index 747591c5d05092765d34ce831b80c8247c686b4d..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/can_we_infer_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "can we infer_frmt",
-  "evaluation": {
-    "accuracy": 0.5481927710843374
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/guaranteed_possible_impossible_frmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/guaranteed_possible_impossible_frmt/results.json
deleted file mode 100644
index e68c0401faae36b7d5dd65ad7f70ff83b2017a99..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/guaranteed_possible_impossible_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "guaranteed/possible/impossible_frmt",
-  "evaluation": {
-    "accuracy": 0.5200803212851406
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/justified_in_saying_frmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/justified_in_saying_frmt/results.json
deleted file mode 100644
index e090b6a7eb4dd90f5841a558b077302d3215dc06..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/fr/justified_in_saying_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "justified in saying_frmt",
-  "evaluation": {
-    "accuracy": 0.5317269076305221
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/GPT-3_style_himt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/GPT-3_style_himt/results.json
deleted file mode 100644
index 6d2f4a90d39fd21e0d000fd2bddde19b805d14b7..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/GPT-3_style_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "GPT-3 style_himt",
-  "evaluation": {
-    "accuracy": 0.43734939759036146
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/MNLI_crowdsource_himt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/MNLI_crowdsource_himt/results.json
deleted file mode 100644
index abe331491e50afd38a294d6ba0a85fe7938d63ed..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/MNLI_crowdsource_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "MNLI crowdsource_himt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/can_we_infer_himt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/can_we_infer_himt/results.json
deleted file mode 100644
index 15a23fa63fdb4337027f4313554e5ea759f1b6ff..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/can_we_infer_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "can we infer_himt",
-  "evaluation": {
-    "accuracy": 0.4795180722891566
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/guaranteed_possible_impossible_himt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/guaranteed_possible_impossible_himt/results.json
deleted file mode 100644
index ed3e0ab27f59a22926fd7a9183629f8a4874116f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/guaranteed_possible_impossible_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "guaranteed/possible/impossible_himt",
-  "evaluation": {
-    "accuracy": 0.44136546184738956
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/justified_in_saying_himt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/justified_in_saying_himt/results.json
deleted file mode 100644
index 6f63b529de724ddad08f41a3e982301790a159f8..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/hi/justified_in_saying_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "justified in saying_himt",
-  "evaluation": {
-    "accuracy": 0.4931726907630522
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/merged.csv b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/merged.csv
deleted file mode 100644
index 75c13d2a1e9948b86e1e443ac0845ada11e93b59..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/merged.csv
+++ /dev/null
@@ -1,50 +0,0 @@
-dataset,prompt,metric,value
-xnli_ar,GPT-3 style_armt,accuracy,0.3333333333333333
-xnli_ar,MNLI crowdsource_armt,accuracy,0.4542168674698795
-xnli_ar,can we infer_armt,accuracy,0.41967871485943775
-xnli_ar,guaranteed/possible/impossible_armt,accuracy,0.3795180722891566
-xnli_ar,justified in saying_armt,accuracy,0.44016064257028115
-xnli_ar,median,accuracy,0.41967871485943775
-xnli_es,GPT-3 style_esmt,accuracy,0.5381526104417671
-xnli_es,MNLI crowdsource_esmt,accuracy,0.4951807228915663
-xnli_es,can we infer_esmt,accuracy,0.4951807228915663
-xnli_es,guaranteed/possible/impossible_esmt,accuracy,0.3349397590361446
-xnli_es,justified in saying_esmt,accuracy,0.4955823293172691
-xnli_es,median,accuracy,0.4951807228915663
-xnli_fr,GPT-3 style_frmt,accuracy,0.4746987951807229
-xnli_fr,MNLI crowdsource_frmt,accuracy,0.3538152610441767
-xnli_fr,can we infer_frmt,accuracy,0.5481927710843374
-xnli_fr,guaranteed/possible/impossible_frmt,accuracy,0.5200803212851406
-xnli_fr,justified in saying_frmt,accuracy,0.5317269076305221
-xnli_fr,median,accuracy,0.5200803212851406
-xnli_hi,GPT-3 style_himt,accuracy,0.43734939759036146
-xnli_hi,MNLI crowdsource_himt,accuracy,0.3333333333333333
-xnli_hi,can we infer_himt,accuracy,0.4795180722891566
-xnli_hi,guaranteed/possible/impossible_himt,accuracy,0.44136546184738956
-xnli_hi,justified in saying_himt,accuracy,0.4931726907630522
-xnli_hi,median,accuracy,0.44136546184738956
-xnli_sw,GPT-3 style_swmt,accuracy,0.3357429718875502
-xnli_sw,MNLI crowdsource_swmt,accuracy,0.3353413654618474
-xnli_sw,can we infer_swmt,accuracy,0.3682730923694779
-xnli_sw,guaranteed/possible/impossible_swmt,accuracy,0.351004016064257
-xnli_sw,justified in saying_swmt,accuracy,0.36305220883534134
-xnli_sw,median,accuracy,0.351004016064257
-xnli_ur,GPT-3 style_urmt,accuracy,0.3586345381526104
-xnli_ur,MNLI crowdsource_urmt,accuracy,0.3369477911646586
-xnli_ur,can we infer_urmt,accuracy,0.351004016064257
-xnli_ur,guaranteed/possible/impossible_urmt,accuracy,0.3337349397590361
-xnli_ur,justified in saying_urmt,accuracy,0.3381526104417671
-xnli_ur,median,accuracy,0.3381526104417671
-xnli_vi,GPT-3 style_vimt,accuracy,0.3333333333333333
-xnli_vi,MNLI crowdsource_vimt,accuracy,0.3333333333333333
-xnli_vi,can we infer_vimt,accuracy,0.3333333333333333
-xnli_vi,guaranteed/possible/impossible_vimt,accuracy,0.3333333333333333
-xnli_vi,justified in saying_vimt,accuracy,0.3333333333333333
-xnli_vi,median,accuracy,0.3333333333333333
-xnli_zh,GPT-3 style_zhmt,accuracy,0.5224899598393574
-xnli_zh,MNLI crowdsource_zhmt,accuracy,0.4542168674698795
-xnli_zh,can we infer_zhmt,accuracy,0.5184738955823294
-xnli_zh,guaranteed/possible/impossible_zhmt,accuracy,0.334136546184739
-xnli_zh,justified in saying_zhmt,accuracy,0.4955823293172691
-xnli_zh,median,accuracy,0.4955823293172691
-multiple,average,multiple,0.4242971887550201
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/merged.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/merged.json
deleted file mode 100644
index 6093ec1c3c21248845b11f0f8607678434710f15..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/merged.json
+++ /dev/null
@@ -1 +0,0 @@
-{"xnli_ar": {"GPT-3 style_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "GPT-3 style_armt"}, "MNLI crowdsource_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4542168674698795}, "template_name": "MNLI crowdsource_armt"}, "can we infer_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41967871485943775}, "template_name": "can we infer_armt"}, "guaranteed/possible/impossible_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3795180722891566}, "template_name": "guaranteed/possible/impossible_armt"}, "justified in saying_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44016064257028115}, "template_name": "justified in saying_armt"}}, "xnli_es": {"GPT-3 style_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5381526104417671}, "template_name": "GPT-3 style_esmt"}, "MNLI crowdsource_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4951807228915663}, "template_name": "MNLI crowdsource_esmt"}, "can we infer_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4951807228915663}, "template_name": "can we infer_esmt"}, "guaranteed/possible/impossible_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3349397590361446}, "template_name": "guaranteed/possible/impossible_esmt"}, "justified in saying_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4955823293172691}, "template_name": "justified in saying_esmt"}}, "xnli_fr": {"GPT-3 style_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4746987951807229}, "template_name": "GPT-3 style_frmt"}, "MNLI crowdsource_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3538152610441767}, "template_name": "MNLI crowdsource_frmt"}, "can we infer_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5481927710843374}, "template_name": "can we infer_frmt"}, "guaranteed/possible/impossible_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5200803212851406}, "template_name": "guaranteed/possible/impossible_frmt"}, "justified in saying_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5317269076305221}, "template_name": "justified in saying_frmt"}}, "xnli_hi": {"GPT-3 style_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43734939759036146}, "template_name": "GPT-3 style_himt"}, "MNLI crowdsource_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "MNLI crowdsource_himt"}, "can we infer_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4795180722891566}, "template_name": "can we infer_himt"}, "guaranteed/possible/impossible_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44136546184738956}, "template_name": "guaranteed/possible/impossible_himt"}, "justified in saying_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4931726907630522}, "template_name": "justified in saying_himt"}}, "xnli_sw": {"GPT-3 style_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3357429718875502}, "template_name": "GPT-3 style_swmt"}, "MNLI crowdsource_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3353413654618474}, "template_name": "MNLI crowdsource_swmt"}, "can we infer_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3682730923694779}, "template_name": "can we infer_swmt"}, "guaranteed/possible/impossible_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.351004016064257}, "template_name": "guaranteed/possible/impossible_swmt"}, "justified in saying_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.36305220883534134}, "template_name": "justified in saying_swmt"}}, "xnli_ur": {"GPT-3 style_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3586345381526104}, "template_name": "GPT-3 style_urmt"}, "MNLI crowdsource_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3369477911646586}, "template_name": "MNLI crowdsource_urmt"}, "can we infer_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.351004016064257}, "template_name": "can we infer_urmt"}, "guaranteed/possible/impossible_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3337349397590361}, "template_name": "guaranteed/possible/impossible_urmt"}, "justified in saying_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3381526104417671}, "template_name": "justified in saying_urmt"}}, "xnli_vi": {"GPT-3 style_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "GPT-3 style_vimt"}, "MNLI crowdsource_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "MNLI crowdsource_vimt"}, "can we infer_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "can we infer_vimt"}, "guaranteed/possible/impossible_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "guaranteed/possible/impossible_vimt"}, "justified in saying_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "justified in saying_vimt"}}, "xnli_zh": {"GPT-3 style_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5224899598393574}, "template_name": "GPT-3 style_zhmt"}, "MNLI crowdsource_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4542168674698795}, "template_name": "MNLI crowdsource_zhmt"}, "can we infer_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5184738955823294}, "template_name": "can we infer_zhmt"}, "guaranteed/possible/impossible_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.334136546184739}, "template_name": "guaranteed/possible/impossible_zhmt"}, "justified in saying_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4955823293172691}, "template_name": "justified in saying_zhmt"}}}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/GPT-3_style_swmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/GPT-3_style_swmt/results.json
deleted file mode 100644
index 7af2ade81619d1d5d680f533d2d718004ac47e9f..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/GPT-3_style_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "GPT-3 style_swmt",
-  "evaluation": {
-    "accuracy": 0.3357429718875502
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/MNLI_crowdsource_swmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/MNLI_crowdsource_swmt/results.json
deleted file mode 100644
index 1a7435f3ab6279405fe0758f869cb8e13c33482e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/MNLI_crowdsource_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "MNLI crowdsource_swmt",
-  "evaluation": {
-    "accuracy": 0.3353413654618474
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/can_we_infer_swmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/can_we_infer_swmt/results.json
deleted file mode 100644
index a09fc6d4bbbeee4f4f9f9c803acfaf3b1c48c451..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/can_we_infer_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "can we infer_swmt",
-  "evaluation": {
-    "accuracy": 0.3682730923694779
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/guaranteed_possible_impossible_swmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/guaranteed_possible_impossible_swmt/results.json
deleted file mode 100644
index f15426fe5979ede1ee47fc0edfcd025c24963e85..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/guaranteed_possible_impossible_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "guaranteed/possible/impossible_swmt",
-  "evaluation": {
-    "accuracy": 0.351004016064257
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/justified_in_saying_swmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/justified_in_saying_swmt/results.json
deleted file mode 100644
index 90394faebea44e1c6d7894b2cc89c522ebd8df7a..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/sw/justified_in_saying_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "justified in saying_swmt",
-  "evaluation": {
-    "accuracy": 0.36305220883534134
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/GPT-3_style_urmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/GPT-3_style_urmt/results.json
deleted file mode 100644
index 0282d0be6303786a180f50618e07877161f505b3..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/GPT-3_style_urmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "GPT-3 style_urmt",
-  "evaluation": {
-    "accuracy": 0.3586345381526104
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/MNLI_crowdsource_urmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/MNLI_crowdsource_urmt/results.json
deleted file mode 100644
index 611968c8b90be64d7d60113395460c0fd457d2b2..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/MNLI_crowdsource_urmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "MNLI crowdsource_urmt",
-  "evaluation": {
-    "accuracy": 0.3369477911646586
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/can_we_infer_urmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/can_we_infer_urmt/results.json
deleted file mode 100644
index 4c1ae10e00ad59b9af33932760d44cc12ea1f8ee..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/can_we_infer_urmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "can we infer_urmt",
-  "evaluation": {
-    "accuracy": 0.351004016064257
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/guaranteed_possible_impossible_urmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/guaranteed_possible_impossible_urmt/results.json
deleted file mode 100644
index 35f9483bd5e19259d6f473f0c4d973e39f80fca4..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/guaranteed_possible_impossible_urmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "guaranteed/possible/impossible_urmt",
-  "evaluation": {
-    "accuracy": 0.3337349397590361
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/justified_in_saying_urmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/justified_in_saying_urmt/results.json
deleted file mode 100644
index ca92fbad1769d545db2d32b0cc2cd21fc2531536..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/ur/justified_in_saying_urmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "justified in saying_urmt",
-  "evaluation": {
-    "accuracy": 0.3381526104417671
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/GPT-3_style_vimt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/GPT-3_style_vimt/results.json
deleted file mode 100644
index 58246bf4d433606693cb97adc8622b6fa1c74e4c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/GPT-3_style_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "GPT-3 style_vimt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/MNLI_crowdsource_vimt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/MNLI_crowdsource_vimt/results.json
deleted file mode 100644
index 5dd3868927989b006df5b17daa4cc23d5238ad22..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/MNLI_crowdsource_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "MNLI crowdsource_vimt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/can_we_infer_vimt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/can_we_infer_vimt/results.json
deleted file mode 100644
index b73e0f205f3841b05d86a9449c601b8b6f111e5e..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/can_we_infer_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "can we infer_vimt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/guaranteed_possible_impossible_vimt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/guaranteed_possible_impossible_vimt/results.json
deleted file mode 100644
index 39992e80223b4a30b931b52f708a7838f6061c8c..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/guaranteed_possible_impossible_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "guaranteed/possible/impossible_vimt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/justified_in_saying_vimt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/justified_in_saying_vimt/results.json
deleted file mode 100644
index e723b8a7e36d3a1063dfd79f26ee3443b922fbb8..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/vi/justified_in_saying_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "justified in saying_vimt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/GPT-3_style_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/GPT-3_style_zhmt/results.json
deleted file mode 100644
index e1ae998ee8df46751b4fe8f0cb439cb1a29acbea..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/GPT-3_style_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "GPT-3 style_zhmt",
-  "evaluation": {
-    "accuracy": 0.5224899598393574
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/MNLI_crowdsource_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/MNLI_crowdsource_zhmt/results.json
deleted file mode 100644
index 2b524f2a25b623cbb28fe5d4ebd40ac24d4578c2..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/MNLI_crowdsource_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "MNLI crowdsource_zhmt",
-  "evaluation": {
-    "accuracy": 0.4542168674698795
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/can_we_infer_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/can_we_infer_zhmt/results.json
deleted file mode 100644
index 7a9e5b9cd1bd435fe6320882569918515362eecd..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/can_we_infer_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "can we infer_zhmt",
-  "evaluation": {
-    "accuracy": 0.5184738955823294
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/guaranteed_possible_impossible_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/guaranteed_possible_impossible_zhmt/results.json
deleted file mode 100644
index 9af567d30784f4d6ccd77e76af4f9c84323d92db..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/guaranteed_possible_impossible_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "guaranteed/possible/impossible_zhmt",
-  "evaluation": {
-    "accuracy": 0.334136546184739
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/justified_in_saying_zhmt/results.json b/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/justified_in_saying_zhmt/results.json
deleted file mode 100644
index 9c6db375f86ea1876f4eca2db924d921d1ce7b11..0000000000000000000000000000000000000000
--- a/evaluation_bloomz-mt/evaluation_xnlimt/xnli/zh/justified_in_saying_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "justified in saying_zhmt",
-  "evaluation": {
-    "accuracy": 0.4955823293172691
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/tr13-176b-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file