diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..997db62bacd35bf6fd1413f472dadb8852f58fc6
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Answer Given options_armt",
+  "evaluation": {
+    "accuracy": 0.8669755129053607
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Answer Given options_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e6624cfb82fbbeef52b8c91bd7310bd6a5ff74d2
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Choose Story Ending_armt",
+  "evaluation": {
+    "accuracy": 0.927862342819325
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Choose Story Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b3eecdd22bb453a1c14e2920642ded6a23b0219f
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Generate Ending_armt",
+  "evaluation": {
+    "accuracy": 0.6479152878888154
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Generate Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..719ac2f7efad4d8653d2045376adfef115b1d1ff
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Novel Correct Ending_armt",
+  "evaluation": {
+    "accuracy": 0.9185969556585043
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Novel Correct Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..06f78140700466f8a0d3b110efdec902fb043873
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "ar",
+  "template_name": "Story Continuation and Options_armt",
+  "evaluation": {
+    "accuracy": 0.9113170086035738
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Story Continuation and Options_armt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..02ce2915c686413c72e27889e2aa2611031ca219
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Answer Given options_esmt",
+  "evaluation": {
+    "accuracy": 0.9272005294506949
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Answer Given options_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..461ec3b69f608aad5131870d0f2c82967d5077f1
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Choose Story Ending_esmt",
+  "evaluation": {
+    "accuracy": 0.9437458636664461
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Choose Story Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..86cc9454e246bd329a9cee28e4ffe1d5a3df975e
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Generate Ending_esmt",
+  "evaluation": {
+    "accuracy": 0.7445400397088021
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Generate Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..91260d2529c5cc0ff59ad8f7053d2c612c1001b3
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Novel Correct Ending_esmt",
+  "evaluation": {
+    "accuracy": 0.9397749834546658
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Novel Correct Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8788445181e248932e17565c89b7ea5a80516e1c
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "es",
+  "template_name": "Story Continuation and Options_esmt",
+  "evaluation": {
+    "accuracy": 0.9298477829252151
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Story Continuation and Options_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b7269c66d0381e9ae2735cbec048dc3c2516ccf3
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Answer Given options_eumt",
+  "evaluation": {
+    "accuracy": 0.7452018530774321
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Answer Given options_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8c0bd6a769ecec0bd8e5cc328e99ee5aeacbcc23
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Choose Story Ending_eumt",
+  "evaluation": {
+    "accuracy": 0.8676373262739907
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Choose Story Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..02a0225b6ad750a9f2b97c43f26414ac41bae4d2
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Generate Ending_eumt",
+  "evaluation": {
+    "accuracy": 0.6082064857710126
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Generate Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..fb41b6113f3ef416b2e16282ca3321bba5df6479
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Novel Correct Ending_eumt",
+  "evaluation": {
+    "accuracy": 0.8219722038385175
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Novel Correct Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2caa1a5cea2c53cc1a7fc082208cd65f8c0fbed6
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "eu",
+  "template_name": "Story Continuation and Options_eumt",
+  "evaluation": {
+    "accuracy": 0.814030443414957
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Story Continuation and Options_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..66aa1a2cb0d0feefb4d63babaf0860122bfa6ad8
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Answer Given options_himt",
+  "evaluation": {
+    "accuracy": 0.8266048974189278
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Answer Given options_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a84769bba7cc4d0418413f16ae10bf86a10bcc00
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Choose Story Ending_himt",
+  "evaluation": {
+    "accuracy": 0.8841826604897419
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Choose Story Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..6a7fdd02fd8aed986f289c9dca213443a2f132d9
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Generate Ending_himt",
+  "evaluation": {
+    "accuracy": 0.657180675049636
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Generate Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b31f5dde4314c1213de653be72d6c79e082671ca
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Novel Correct Ending_himt",
+  "evaluation": {
+    "accuracy": 0.8669755129053607
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Novel Correct Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9ce81f8c8e52790371b0ee3c813e033cf2454442
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "hi",
+  "template_name": "Story Continuation and Options_himt",
+  "evaluation": {
+    "accuracy": 0.8689609530112509
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Story Continuation and Options_himt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d95134fc2ac94f77a05aa2ce64c3bdf54e09c1b4
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Answer Given options_idmt",
+  "evaluation": {
+    "accuracy": 0.8616810059563204
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Answer Given options_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..599f3cceae2713f083efe3107f1cceb999feffd7
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Choose Story Ending_idmt",
+  "evaluation": {
+    "accuracy": 0.914626075446724
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Choose Story Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..175390828a8a75e99850b888d28f4e631db746ee
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Generate Ending_idmt",
+  "evaluation": {
+    "accuracy": 0.6730641958967571
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Generate Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..53b1368acd28913356ba6aabf207235b9e930d45
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Novel Correct Ending_idmt",
+  "evaluation": {
+    "accuracy": 0.8954334877564527
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Novel Correct Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e9349c7d824bf4cbb001043faccedc703f6105b2
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "id",
+  "template_name": "Story Continuation and Options_idmt",
+  "evaluation": {
+    "accuracy": 0.8927862342819325
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Story Continuation and Options_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f9413481a2cb936538de7f6567537f57510efc31
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Answer Given options_zhmt",
+  "evaluation": {
+    "accuracy": 0.9060225016545335
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3848e3b6010638e5baa3605adcb2ed4e7565e872
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Choose Story Ending_zhmt",
+  "evaluation": {
+    "accuracy": 0.9238914626075446
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..82388d3615260ef92a1a29f90aa5a65ccb88b2ef
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Generate Ending_zhmt",
+  "evaluation": {
+    "accuracy": 0.686962276637988
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..54c1d2c5dd81fa11a302a50c6d963e2920425905
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Novel Correct Ending_zhmt",
+  "evaluation": {
+    "accuracy": 0.9185969556585043
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..ab07c990e89ee646528b8a9ec96060f38b3f9799
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xstory_cloze",
+  "dataset_config_name": "zh",
+  "template_name": "Story Continuation and Options_zhmt",
+  "evaluation": {
+    "accuracy": 0.9192587690271343
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/Replace_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/Replace_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3372322aac8e5d3108f503eec3b6d9957b1eff1
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/Replace_frmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "Replace_frmt",
+  "evaluation": {
+    "accuracy": 0.6506024096385542
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='Replace_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d736e10d0ad8f235c8626b055de1b0f8e01ac5bc
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "True or False_frmt",
+  "evaluation": {
+    "accuracy": 0.5662650602409639
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='True or False_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3892016acc8c698db5e40b53f12b2283e243512d
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "does underscore refer to_frmt",
+  "evaluation": {
+    "accuracy": 0.5542168674698795
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='does underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..70e2893ab1283473fe670d6b9c1559319c97ad14
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "stand for_frmt",
+  "evaluation": {
+    "accuracy": 0.4819277108433735
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='stand for_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9dbde34f02fcdaff994a777d2ace720425c36a96
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "fr",
+  "template_name": "underscore refer to_frmt",
+  "evaluation": {
+    "accuracy": 0.6144578313253012
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8d395d02cb45165e2739532fcd583fec2b67d810
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "Replace_ptmt",
+  "evaluation": {
+    "accuracy": 0.6425855513307985
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='Replace_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e9efc4706c4ab11835d68991128cc5decbda26a1
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "True or False_ptmt",
+  "evaluation": {
+    "accuracy": 0.49809885931558934
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='True or False_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a633238a24aeb37cc8331ccb4e449cfde8fe687
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "does underscore refer to_ptmt",
+  "evaluation": {
+    "accuracy": 0.6045627376425855
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='does underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..83427e2277d192918a6bad4b84f9513d8377c356
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "stand for_ptmt",
+  "evaluation": {
+    "accuracy": 0.5095057034220533
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='stand for_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..df1a9dea52cf887e4794c7c459663da099890361
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "pt",
+  "template_name": "underscore refer to_ptmt",
+  "evaluation": {
+    "accuracy": 0.6273764258555133
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..521b05f635446268ac7e469eb44c341e724df736
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "Replace_zhmt",
+  "evaluation": {
+    "accuracy": 0.6845238095238095
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..babb0d1084f0781ee2f6990cc6963fecdff81cce
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "True or False_zhmt",
+  "evaluation": {
+    "accuracy": 0.503968253968254
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..076524ebe2c67a402da793a9ab9af6bd4a86cef3
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "does underscore refer to_zhmt",
+  "evaluation": {
+    "accuracy": 0.626984126984127
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='does underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..78853a372526204eafbeaec7f53bdebd663676c1
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "stand for_zhmt",
+  "evaluation": {
+    "accuracy": 0.503968253968254
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..af13eaee7579fc491fc6899915af192a937c7580
--- /dev/null
+++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "Muennighoff/xwinograd",
+  "dataset_config_name": "zh",
+  "template_name": "underscore refer to_zhmt",
+  "evaluation": {
+    "accuracy": 0.7023809523809523
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/merged.csv b/evaluation_xcopawinostorymt/merged.csv
new file mode 100644
index 0000000000000000000000000000000000000000..24d3f440c11ab4c7ff93650308d34960b1bae599
--- /dev/null
+++ b/evaluation_xcopawinostorymt/merged.csv
@@ -0,0 +1,86 @@
+dataset,prompt,metric,value
+xcopa_id,C1 or C2? premise_idmt,accuracy,0.52
+xcopa_id,best_option_idmt,accuracy,0.73
+xcopa_id,cause_effect_idmt,accuracy,0.82
+xcopa_id,i_am_hesitating_idmt,accuracy,0.76
+xcopa_id,plausible_alternatives_idmt,accuracy,0.78
+xcopa_id,median,accuracy,0.76
+xcopa_sw,C1 or C2? premise_swmt,accuracy,0.59
+xcopa_sw,best_option_swmt,accuracy,0.62
+xcopa_sw,cause_effect_swmt,accuracy,0.64
+xcopa_sw,i_am_hesitating_swmt,accuracy,0.63
+xcopa_sw,plausible_alternatives_swmt,accuracy,0.64
+xcopa_sw,median,accuracy,0.63
+xcopa_ta,C1 or C2? premise_tamt,accuracy,0.6
+xcopa_ta,best_option_tamt,accuracy,0.47
+xcopa_ta,cause_effect_tamt,accuracy,0.62
+xcopa_ta,i_am_hesitating_tamt,accuracy,0.64
+xcopa_ta,plausible_alternatives_tamt,accuracy,0.63
+xcopa_ta,median,accuracy,0.62
+xcopa_vi,C1 or C2? premise_vimt,accuracy,0.56
+xcopa_vi,best_option_vimt,accuracy,0.77
+xcopa_vi,cause_effect_vimt,accuracy,0.84
+xcopa_vi,i_am_hesitating_vimt,accuracy,0.82
+xcopa_vi,plausible_alternatives_vimt,accuracy,0.84
+xcopa_vi,median,accuracy,0.82
+xcopa_zh,C1 or C2? premise_zhmt,accuracy,0.62
+xcopa_zh,best_option_zhmt,accuracy,0.72
+xcopa_zh,cause_effect_zhmt,accuracy,0.89
+xcopa_zh,i_am_hesitating_zhmt,accuracy,0.9
+xcopa_zh,plausible_alternatives_zhmt,accuracy,0.86
+xcopa_zh,median,accuracy,0.86
+xstory_cloze_ar,Answer Given options_armt,accuracy,0.8669755129053607
+xstory_cloze_ar,Choose Story Ending_armt,accuracy,0.927862342819325
+xstory_cloze_ar,Generate Ending_armt,accuracy,0.6479152878888154
+xstory_cloze_ar,Novel Correct Ending_armt,accuracy,0.9185969556585043
+xstory_cloze_ar,Story Continuation and Options_armt,accuracy,0.9113170086035738
+xstory_cloze_ar,median,accuracy,0.9113170086035738
+xstory_cloze_es,Answer Given options_esmt,accuracy,0.9272005294506949
+xstory_cloze_es,Choose Story Ending_esmt,accuracy,0.9437458636664461
+xstory_cloze_es,Generate Ending_esmt,accuracy,0.7445400397088021
+xstory_cloze_es,Novel Correct Ending_esmt,accuracy,0.9397749834546658
+xstory_cloze_es,Story Continuation and Options_esmt,accuracy,0.9298477829252151
+xstory_cloze_es,median,accuracy,0.9298477829252151
+xstory_cloze_eu,Answer Given options_eumt,accuracy,0.7452018530774321
+xstory_cloze_eu,Choose Story Ending_eumt,accuracy,0.8676373262739907
+xstory_cloze_eu,Generate Ending_eumt,accuracy,0.6082064857710126
+xstory_cloze_eu,Novel Correct Ending_eumt,accuracy,0.8219722038385175
+xstory_cloze_eu,Story Continuation and Options_eumt,accuracy,0.814030443414957
+xstory_cloze_eu,median,accuracy,0.814030443414957
+xstory_cloze_hi,Answer Given options_himt,accuracy,0.8266048974189278
+xstory_cloze_hi,Choose Story Ending_himt,accuracy,0.8841826604897419
+xstory_cloze_hi,Generate Ending_himt,accuracy,0.657180675049636
+xstory_cloze_hi,Novel Correct Ending_himt,accuracy,0.8669755129053607
+xstory_cloze_hi,Story Continuation and Options_himt,accuracy,0.8689609530112509
+xstory_cloze_hi,median,accuracy,0.8669755129053607
+xstory_cloze_id,Answer Given options_idmt,accuracy,0.8616810059563204
+xstory_cloze_id,Choose Story Ending_idmt,accuracy,0.914626075446724
+xstory_cloze_id,Generate Ending_idmt,accuracy,0.6730641958967571
+xstory_cloze_id,Novel Correct Ending_idmt,accuracy,0.8954334877564527
+xstory_cloze_id,Story Continuation and Options_idmt,accuracy,0.8927862342819325
+xstory_cloze_id,median,accuracy,0.8927862342819325
+xstory_cloze_zh,Answer Given options_zhmt,accuracy,0.9060225016545335
+xstory_cloze_zh,Choose Story Ending_zhmt,accuracy,0.9238914626075446
+xstory_cloze_zh,Generate Ending_zhmt,accuracy,0.686962276637988
+xstory_cloze_zh,Novel Correct Ending_zhmt,accuracy,0.9185969556585043
+xstory_cloze_zh,Story Continuation and Options_zhmt,accuracy,0.9192587690271343
+xstory_cloze_zh,median,accuracy,0.9185969556585043
+xwinograd_fr,Replace_frmt,accuracy,0.6506024096385542
+xwinograd_fr,True or False_frmt,accuracy,0.5662650602409639
+xwinograd_fr,does underscore refer to_frmt,accuracy,0.5542168674698795
+xwinograd_fr,stand for_frmt,accuracy,0.4819277108433735
+xwinograd_fr,underscore refer to_frmt,accuracy,0.6144578313253012
+xwinograd_fr,median,accuracy,0.5662650602409639
+xwinograd_pt,Replace_ptmt,accuracy,0.6425855513307985
+xwinograd_pt,True or False_ptmt,accuracy,0.49809885931558934
+xwinograd_pt,does underscore refer to_ptmt,accuracy,0.6045627376425855
+xwinograd_pt,stand for_ptmt,accuracy,0.5095057034220533
+xwinograd_pt,underscore refer to_ptmt,accuracy,0.6273764258555133
+xwinograd_pt,median,accuracy,0.6045627376425855
+xwinograd_zh,Replace_zhmt,accuracy,0.6845238095238095
+xwinograd_zh,True or False_zhmt,accuracy,0.503968253968254
+xwinograd_zh,does underscore refer to_zhmt,accuracy,0.626984126984127
+xwinograd_zh,stand for_zhmt,accuracy,0.503968253968254
+xwinograd_zh,underscore refer to_zhmt,accuracy,0.7023809523809523
+xwinograd_zh,median,accuracy,0.626984126984127
+multiple,average,multiple,0.7729547044755157
diff --git a/evaluation_xcopawinostorymt/merged.json b/evaluation_xcopawinostorymt/merged.json
new file mode 100644
index 0000000000000000000000000000000000000000..475ec7d320796d8b63ccd3ab6fb46dfd89c597f9
--- /dev/null
+++ b/evaluation_xcopawinostorymt/merged.json
@@ -0,0 +1 @@
+{"Muennighoff/xstory_cloze_ar": {"Answer Given options_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Answer Given options_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8669755129053607}, "template_name": "Answer Given options_armt"}, "Choose Story Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Choose Story Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.927862342819325}, "template_name": "Choose Story Ending_armt"}, "Generate Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Generate Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6479152878888154}, "template_name": "Generate Ending_armt"}, "Novel Correct Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Novel Correct Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9185969556585043}, "template_name": "Novel Correct Ending_armt"}, "Story Continuation and Options_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Story Continuation and Options_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9113170086035738}, "template_name": "Story Continuation and Options_armt"}}, "Muennighoff/xstory_cloze_es": {"Answer Given options_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Answer Given options_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9272005294506949}, "template_name": "Answer Given options_esmt"}, "Choose Story Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Choose Story Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9437458636664461}, "template_name": "Choose Story Ending_esmt"}, "Generate Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Generate Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7445400397088021}, "template_name": "Generate Ending_esmt"}, "Novel Correct Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Novel Correct Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9397749834546658}, "template_name": "Novel Correct Ending_esmt"}, "Story Continuation and Options_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Story Continuation and Options_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9298477829252151}, "template_name": "Story Continuation and Options_esmt"}}, "Muennighoff/xstory_cloze_eu": {"Answer Given options_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Answer Given options_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7452018530774321}, "template_name": "Answer Given options_eumt"}, "Choose Story Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Choose Story Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8676373262739907}, "template_name": "Choose Story Ending_eumt"}, "Generate Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Generate Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6082064857710126}, "template_name": "Generate Ending_eumt"}, "Novel Correct Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Novel Correct Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8219722038385175}, "template_name": "Novel Correct Ending_eumt"}, "Story Continuation and Options_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Story Continuation and Options_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.814030443414957}, "template_name": "Story Continuation and Options_eumt"}}, "Muennighoff/xstory_cloze_hi": {"Answer Given options_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Answer Given options_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8266048974189278}, "template_name": "Answer Given options_himt"}, "Choose Story Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Choose Story Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8841826604897419}, "template_name": "Choose Story Ending_himt"}, "Generate Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Generate Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.657180675049636}, "template_name": "Generate Ending_himt"}, "Novel Correct Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Novel Correct Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8669755129053607}, "template_name": "Novel Correct Ending_himt"}, "Story Continuation and Options_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Story Continuation and Options_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8689609530112509}, "template_name": "Story Continuation and Options_himt"}}, "Muennighoff/xstory_cloze_id": {"Answer Given options_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Answer Given options_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8616810059563204}, "template_name": "Answer Given options_idmt"}, "Choose Story Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Choose Story Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.914626075446724}, "template_name": "Choose Story Ending_idmt"}, "Generate Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Generate Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6730641958967571}, "template_name": "Generate Ending_idmt"}, "Novel Correct Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Novel Correct Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8954334877564527}, "template_name": "Novel Correct Ending_idmt"}, "Story Continuation and Options_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Story Continuation and Options_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8927862342819325}, "template_name": "Story Continuation and Options_idmt"}}, "Muennighoff/xstory_cloze_zh": {"Answer Given options_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9060225016545335}, "template_name": "Answer Given options_zhmt"}, "Choose Story Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9238914626075446}, "template_name": "Choose Story Ending_zhmt"}, "Generate Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.686962276637988}, "template_name": "Generate Ending_zhmt"}, "Novel Correct Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9185969556585043}, "template_name": "Novel Correct Ending_zhmt"}, "Story Continuation and Options_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9192587690271343}, "template_name": "Story Continuation and Options_zhmt"}}, "Muennighoff/xwinograd_fr": {"Replace_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='Replace_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6506024096385542}, "template_name": "Replace_frmt"}, "True or False_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='True or False_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5662650602409639}, "template_name": "True or False_frmt"}, "does underscore refer to_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='does underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5542168674698795}, "template_name": "does underscore refer to_frmt"}, "stand for_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='stand for_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.4819277108433735}, "template_name": "stand for_frmt"}, "underscore refer to_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6144578313253012}, "template_name": "underscore refer to_frmt"}}, "Muennighoff/xwinograd_pt": {"Replace_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='Replace_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6425855513307985}, "template_name": "Replace_ptmt"}, "True or False_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='True or False_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.49809885931558934}, "template_name": "True or False_ptmt"}, "does underscore refer to_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='does underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6045627376425855}, "template_name": "does underscore refer to_ptmt"}, "stand for_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='stand for_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5095057034220533}, "template_name": "stand for_ptmt"}, "underscore refer to_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6273764258555133}, "template_name": "underscore refer to_ptmt"}}, "Muennighoff/xwinograd_zh": {"Replace_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6845238095238095}, "template_name": "Replace_zhmt"}, "True or False_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.503968253968254}, "template_name": "True or False_zhmt"}, "does underscore refer to_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='does underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.626984126984127}, "template_name": "does underscore refer to_zhmt"}, "stand for_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.503968253968254}, "template_name": "stand for_zhmt"}, "underscore refer to_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.7023809523809523}, "template_name": "underscore refer to_zhmt"}}, "xcopa_id": {"C1 or C2? premise_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='C1 or C2? premise_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.52}, "template_name": "C1 or C2? premise_idmt"}, "best_option_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='best_option_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.73}, "template_name": "best_option_idmt"}, "cause_effect_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='cause_effect_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.82}, "template_name": "cause_effect_idmt"}, "i_am_hesitating_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='i_am_hesitating_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.76}, "template_name": "i_am_hesitating_idmt"}, "plausible_alternatives_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='plausible_alternatives_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.78}, "template_name": "plausible_alternatives_idmt"}}, "xcopa_sw": {"C1 or C2? premise_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='C1 or C2? premise_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.59}, "template_name": "C1 or C2? premise_swmt"}, "best_option_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='best_option_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "best_option_swmt"}, "cause_effect_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='cause_effect_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "cause_effect_swmt"}, "i_am_hesitating_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='i_am_hesitating_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.63}, "template_name": "i_am_hesitating_swmt"}, "plausible_alternatives_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='plausible_alternatives_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "plausible_alternatives_swmt"}}, "xcopa_ta": {"C1 or C2? premise_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='C1 or C2? premise_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.6}, "template_name": "C1 or C2? premise_tamt"}, "best_option_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='best_option_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.47}, "template_name": "best_option_tamt"}, "cause_effect_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='cause_effect_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "cause_effect_tamt"}, "i_am_hesitating_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='i_am_hesitating_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "i_am_hesitating_tamt"}, "plausible_alternatives_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='plausible_alternatives_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.63}, "template_name": "plausible_alternatives_tamt"}}, "xcopa_vi": {"C1 or C2? premise_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='C1 or C2? premise_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.56}, "template_name": "C1 or C2? premise_vimt"}, "best_option_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='best_option_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.77}, "template_name": "best_option_vimt"}, "cause_effect_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='cause_effect_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.84}, "template_name": "cause_effect_vimt"}, "i_am_hesitating_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='i_am_hesitating_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.82}, "template_name": "i_am_hesitating_vimt"}, "plausible_alternatives_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='plausible_alternatives_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.84}, "template_name": "plausible_alternatives_vimt"}}, "xcopa_zh": {"C1 or C2? premise_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "C1 or C2? premise_zhmt"}, "best_option_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.72}, "template_name": "best_option_zhmt"}, "cause_effect_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='cause_effect_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.89}, "template_name": "cause_effect_zhmt"}, "i_am_hesitating_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.9}, "template_name": "i_am_hesitating_zhmt"}, "plausible_alternatives_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.86}, "template_name": "plausible_alternatives_zhmt"}}}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/id/C1_or_C2?_premise_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/C1_or_C2?_premise_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d18fed5573d9e7431c3cd9487581f8c564ab578
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/id/C1_or_C2?_premise_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "id",
+  "template_name": "C1 or C2? premise_idmt",
+  "evaluation": {
+    "accuracy": 0.52
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='C1 or C2? premise_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/id/best_option_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/best_option_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f233ae5c456777aac5b1417d10ffd542c87ab48a
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/id/best_option_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "id",
+  "template_name": "best_option_idmt",
+  "evaluation": {
+    "accuracy": 0.73
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='best_option_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/id/cause_effect_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/cause_effect_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f3ddeb503bac6e9b74ab9f16ac0bd697979bfeaa
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/id/cause_effect_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "id",
+  "template_name": "cause_effect_idmt",
+  "evaluation": {
+    "accuracy": 0.82
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='cause_effect_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/id/i_am_hesitating_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/i_am_hesitating_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6152a3d3f369ace4c26504a34ebf357c7b8bace
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/id/i_am_hesitating_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "id",
+  "template_name": "i_am_hesitating_idmt",
+  "evaluation": {
+    "accuracy": 0.76
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='i_am_hesitating_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/id/plausible_alternatives_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/plausible_alternatives_idmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..725e426182a3238175ceaa5ab7cbab0b1e7f8111
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/id/plausible_alternatives_idmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "id",
+  "template_name": "plausible_alternatives_idmt",
+  "evaluation": {
+    "accuracy": 0.78
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='plausible_alternatives_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/sw/C1_or_C2?_premise_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/C1_or_C2?_premise_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2c7a00d76a6333bc34aa6601d2e68015df48f449
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/sw/C1_or_C2?_premise_swmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "sw",
+  "template_name": "C1 or C2? premise_swmt",
+  "evaluation": {
+    "accuracy": 0.59
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='C1 or C2? premise_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/sw/best_option_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/best_option_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..173ef6de7e74ff2b193350e7ba10a9baac445972
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/sw/best_option_swmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "sw",
+  "template_name": "best_option_swmt",
+  "evaluation": {
+    "accuracy": 0.62
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='best_option_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/sw/cause_effect_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/cause_effect_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1dafce6a4ea8d20602aa5dca2eccd5084fdd8fae
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/sw/cause_effect_swmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "sw",
+  "template_name": "cause_effect_swmt",
+  "evaluation": {
+    "accuracy": 0.64
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='cause_effect_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/sw/i_am_hesitating_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/i_am_hesitating_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2f87f621ae05e69755b9c99d53b581ca5404b6c6
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/sw/i_am_hesitating_swmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "sw",
+  "template_name": "i_am_hesitating_swmt",
+  "evaluation": {
+    "accuracy": 0.63
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='i_am_hesitating_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/sw/plausible_alternatives_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/plausible_alternatives_swmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..4e920992048a1dd677f8efe583fd6028eda0eb81
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/sw/plausible_alternatives_swmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "sw",
+  "template_name": "plausible_alternatives_swmt",
+  "evaluation": {
+    "accuracy": 0.64
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='plausible_alternatives_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/ta/C1_or_C2?_premise_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/C1_or_C2?_premise_tamt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..36cc67d63cc026f3fd5618fed9096f2db12a236f
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/ta/C1_or_C2?_premise_tamt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "ta",
+  "template_name": "C1 or C2? premise_tamt",
+  "evaluation": {
+    "accuracy": 0.6
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='C1 or C2? premise_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/ta/best_option_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/best_option_tamt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7729bbe69f7d2120564bd7f292583d853483e788
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/ta/best_option_tamt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "ta",
+  "template_name": "best_option_tamt",
+  "evaluation": {
+    "accuracy": 0.47
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='best_option_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/ta/cause_effect_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/cause_effect_tamt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d460b77dca312d4b564a3742cfe47ae5fd764aa0
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/ta/cause_effect_tamt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "ta",
+  "template_name": "cause_effect_tamt",
+  "evaluation": {
+    "accuracy": 0.62
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='cause_effect_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/ta/i_am_hesitating_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/i_am_hesitating_tamt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c65367a97ed502719e5a6416c0976e4d79c375e
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/ta/i_am_hesitating_tamt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "ta",
+  "template_name": "i_am_hesitating_tamt",
+  "evaluation": {
+    "accuracy": 0.64
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='i_am_hesitating_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/ta/plausible_alternatives_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/plausible_alternatives_tamt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..626d419e0b61a3931438550cfc2584491f93705a
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/ta/plausible_alternatives_tamt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "ta",
+  "template_name": "plausible_alternatives_tamt",
+  "evaluation": {
+    "accuracy": 0.63
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='plausible_alternatives_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/vi/C1_or_C2?_premise_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/C1_or_C2?_premise_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..0798cf5718e3c2fa96370497cecbc51ceafb6023
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/vi/C1_or_C2?_premise_vimt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "vi",
+  "template_name": "C1 or C2? premise_vimt",
+  "evaluation": {
+    "accuracy": 0.56
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='C1 or C2? premise_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/vi/best_option_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/best_option_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..51c5211271e532115de9f6764b8efda253123fa4
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/vi/best_option_vimt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "vi",
+  "template_name": "best_option_vimt",
+  "evaluation": {
+    "accuracy": 0.77
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='best_option_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/vi/cause_effect_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/cause_effect_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b5d213a526385f21ab89b0b3ce9bb37d56d50396
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/vi/cause_effect_vimt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "vi",
+  "template_name": "cause_effect_vimt",
+  "evaluation": {
+    "accuracy": 0.84
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='cause_effect_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/vi/i_am_hesitating_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/i_am_hesitating_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e9f4e840960efac6d65c98d80f1590f42689fe9d
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/vi/i_am_hesitating_vimt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "vi",
+  "template_name": "i_am_hesitating_vimt",
+  "evaluation": {
+    "accuracy": 0.82
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='i_am_hesitating_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/vi/plausible_alternatives_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/plausible_alternatives_vimt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..652ee68654626dd8e0e6328ad034d49da9c7dec0
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/vi/plausible_alternatives_vimt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "vi",
+  "template_name": "plausible_alternatives_vimt",
+  "evaluation": {
+    "accuracy": 0.84
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='plausible_alternatives_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..265db38422ff85e5df5a9f3e67b378c378ba07a4
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "zh",
+  "template_name": "C1 or C2? premise_zhmt",
+  "evaluation": {
+    "accuracy": 0.62
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/zh/best_option_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/best_option_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c887673cfe5637380b482de72f660776fbeb4557
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/zh/best_option_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "zh",
+  "template_name": "best_option_zhmt",
+  "evaluation": {
+    "accuracy": 0.72
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/zh/cause_effect_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/cause_effect_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5d9ef41457058926e54c6b244dcfff0e81e8bb6f
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/zh/cause_effect_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "zh",
+  "template_name": "cause_effect_zhmt",
+  "evaluation": {
+    "accuracy": 0.89
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='cause_effect_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/zh/i_am_hesitating_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/i_am_hesitating_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..63a9404e70cf90d3f6a89bb6794fb478e372cdda
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/zh/i_am_hesitating_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "zh",
+  "template_name": "i_am_hesitating_zhmt",
+  "evaluation": {
+    "accuracy": 0.9
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file
diff --git a/evaluation_xcopawinostorymt/xcopa/zh/plausible_alternatives_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/plausible_alternatives_zhmt/results.json
new file mode 100644
index 0000000000000000000000000000000000000000..953e0df3d4da3f6375f232d68a6bf67b73cef1bc
--- /dev/null
+++ b/evaluation_xcopawinostorymt/xcopa/zh/plausible_alternatives_zhmt/results.json
@@ -0,0 +1,9 @@
+{
+  "dataset_name": "xcopa",
+  "dataset_config_name": "zh",
+  "template_name": "plausible_alternatives_zhmt",
+  "evaluation": {
+    "accuracy": 0.86
+  },
+  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
+}
\ No newline at end of file