diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..997db62bacd35bf6fd1413f472dadb8852f58fc6 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Answer Given options_armt", + "evaluation": { + "accuracy": 0.8669755129053607 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Answer Given options_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..e6624cfb82fbbeef52b8c91bd7310bd6a5ff74d2 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Choose Story Ending_armt", + "evaluation": { + "accuracy": 0.927862342819325 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Choose Story Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..b3eecdd22bb453a1c14e2920642ded6a23b0219f --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Generate Ending_armt", + "evaluation": { + "accuracy": 0.6479152878888154 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Generate Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..719ac2f7efad4d8653d2045376adfef115b1d1ff --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Novel Correct Ending_armt", + "evaluation": { + "accuracy": 0.9185969556585043 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Novel Correct Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..06f78140700466f8a0d3b110efdec902fb043873 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "ar", + "template_name": "Story Continuation and Options_armt", + "evaluation": { + "accuracy": 0.9113170086035738 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Story Continuation and Options_armt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..02ce2915c686413c72e27889e2aa2611031ca219 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Answer Given options_esmt", + "evaluation": { + "accuracy": 0.9272005294506949 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Answer Given options_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..461ec3b69f608aad5131870d0f2c82967d5077f1 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Choose Story Ending_esmt", + "evaluation": { + "accuracy": 0.9437458636664461 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Choose Story Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..86cc9454e246bd329a9cee28e4ffe1d5a3df975e --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Generate Ending_esmt", + "evaluation": { + "accuracy": 0.7445400397088021 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Generate Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..91260d2529c5cc0ff59ad8f7053d2c612c1001b3 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Novel Correct Ending_esmt", + "evaluation": { + "accuracy": 0.9397749834546658 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Novel Correct Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..8788445181e248932e17565c89b7ea5a80516e1c --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "es", + "template_name": "Story Continuation and Options_esmt", + "evaluation": { + "accuracy": 0.9298477829252151 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Story Continuation and Options_esmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..b7269c66d0381e9ae2735cbec048dc3c2516ccf3 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Answer Given options_eumt", + "evaluation": { + "accuracy": 0.7452018530774321 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Answer Given options_eumt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..8c0bd6a769ecec0bd8e5cc328e99ee5aeacbcc23 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Choose Story Ending_eumt", + "evaluation": { + "accuracy": 0.8676373262739907 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Choose Story Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..02a0225b6ad750a9f2b97c43f26414ac41bae4d2 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Generate Ending_eumt", + "evaluation": { + "accuracy": 0.6082064857710126 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Generate Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..fb41b6113f3ef416b2e16282ca3321bba5df6479 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Novel Correct Ending_eumt", + "evaluation": { + "accuracy": 0.8219722038385175 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Novel Correct Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..2caa1a5cea2c53cc1a7fc082208cd65f8c0fbed6 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "eu", + "template_name": "Story Continuation and Options_eumt", + "evaluation": { + "accuracy": 0.814030443414957 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Story Continuation and Options_eumt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..66aa1a2cb0d0feefb4d63babaf0860122bfa6ad8 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Answer Given options_himt", + "evaluation": { + "accuracy": 0.8266048974189278 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Answer Given options_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..a84769bba7cc4d0418413f16ae10bf86a10bcc00 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Choose Story Ending_himt", + "evaluation": { + "accuracy": 0.8841826604897419 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Choose Story Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..6a7fdd02fd8aed986f289c9dca213443a2f132d9 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Generate Ending_himt", + "evaluation": { + "accuracy": 0.657180675049636 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Generate Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..b31f5dde4314c1213de653be72d6c79e082671ca --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Novel Correct Ending_himt", + "evaluation": { + "accuracy": 0.8669755129053607 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Novel Correct Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..9ce81f8c8e52790371b0ee3c813e033cf2454442 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "hi", + "template_name": "Story Continuation and Options_himt", + "evaluation": { + "accuracy": 0.8689609530112509 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Story Continuation and Options_himt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..d95134fc2ac94f77a05aa2ce64c3bdf54e09c1b4 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Answer Given options_idmt", + "evaluation": { + "accuracy": 0.8616810059563204 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Answer Given options_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..599f3cceae2713f083efe3107f1cceb999feffd7 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Choose Story Ending_idmt", + "evaluation": { + "accuracy": 0.914626075446724 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Choose Story Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..175390828a8a75e99850b888d28f4e631db746ee --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Generate Ending_idmt", + "evaluation": { + "accuracy": 0.6730641958967571 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Generate Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..53b1368acd28913356ba6aabf207235b9e930d45 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Novel Correct Ending_idmt", + "evaluation": { + "accuracy": 0.8954334877564527 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Novel Correct Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..e9349c7d824bf4cbb001043faccedc703f6105b2 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "id", + "template_name": "Story Continuation and Options_idmt", + "evaluation": { + "accuracy": 0.8927862342819325 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Story Continuation and Options_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..f9413481a2cb936538de7f6567537f57510efc31 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Answer Given options_zhmt", + "evaluation": { + "accuracy": 0.9060225016545335 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..3848e3b6010638e5baa3605adcb2ed4e7565e872 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Choose Story Ending_zhmt", + "evaluation": { + "accuracy": 0.9238914626075446 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..82388d3615260ef92a1a29f90aa5a65ccb88b2ef --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Generate Ending_zhmt", + "evaluation": { + "accuracy": 0.686962276637988 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..54c1d2c5dd81fa11a302a50c6d963e2920425905 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Novel Correct Ending_zhmt", + "evaluation": { + "accuracy": 0.9185969556585043 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..ab07c990e89ee646528b8a9ec96060f38b3f9799 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xstory_cloze", + "dataset_config_name": "zh", + "template_name": "Story Continuation and Options_zhmt", + "evaluation": { + "accuracy": 0.9192587690271343 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/Replace_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/Replace_frmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..a3372322aac8e5d3108f503eec3b6d9957b1eff1 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/Replace_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + "template_name": "Replace_frmt", + "evaluation": { + "accuracy": 0.6506024096385542 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='Replace_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..d736e10d0ad8f235c8626b055de1b0f8e01ac5bc --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + "template_name": "True or False_frmt", + "evaluation": { + "accuracy": 0.5662650602409639 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='True or False_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..3892016acc8c698db5e40b53f12b2283e243512d --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + "template_name": "does underscore refer to_frmt", + "evaluation": { + "accuracy": 0.5542168674698795 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='does underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..70e2893ab1283473fe670d6b9c1559319c97ad14 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + "template_name": "stand for_frmt", + "evaluation": { + "accuracy": 0.4819277108433735 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='stand for_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..9dbde34f02fcdaff994a777d2ace720425c36a96 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "fr", + "template_name": "underscore refer to_frmt", + "evaluation": { + "accuracy": 0.6144578313253012 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..8d395d02cb45165e2739532fcd583fec2b67d810 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "Replace_ptmt", + "evaluation": { + "accuracy": 0.6425855513307985 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='Replace_ptmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..e9efc4706c4ab11835d68991128cc5decbda26a1 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "True or False_ptmt", + "evaluation": { + "accuracy": 0.49809885931558934 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='True or False_ptmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..3a633238a24aeb37cc8331ccb4e449cfde8fe687 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "does underscore refer to_ptmt", + "evaluation": { + "accuracy": 0.6045627376425855 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='does underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..83427e2277d192918a6bad4b84f9513d8377c356 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "stand for_ptmt", + "evaluation": { + "accuracy": 0.5095057034220533 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='stand for_ptmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..df1a9dea52cf887e4794c7c459663da099890361 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "pt", + "template_name": "underscore refer to_ptmt", + "evaluation": { + "accuracy": 0.6273764258555133 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..521b05f635446268ac7e469eb44c341e724df736 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "Replace_zhmt", + "evaluation": { + "accuracy": 0.6845238095238095 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..babb0d1084f0781ee2f6990cc6963fecdff81cce --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "True or False_zhmt", + "evaluation": { + "accuracy": 0.503968253968254 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..076524ebe2c67a402da793a9ab9af6bd4a86cef3 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "does underscore refer to_zhmt", + "evaluation": { + "accuracy": 0.626984126984127 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='does underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..78853a372526204eafbeaec7f53bdebd663676c1 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "stand for_zhmt", + "evaluation": { + "accuracy": 0.503968253968254 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..af13eaee7579fc491fc6899915af192a937c7580 --- /dev/null +++ b/evaluation_xcopawinostorymt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "Muennighoff/xwinograd", + "dataset_config_name": "zh", + "template_name": "underscore refer to_zhmt", + "evaluation": { + "accuracy": 0.7023809523809523 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/merged.csv b/evaluation_xcopawinostorymt/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..24d3f440c11ab4c7ff93650308d34960b1bae599 --- /dev/null +++ b/evaluation_xcopawinostorymt/merged.csv @@ -0,0 +1,86 @@ +dataset,prompt,metric,value +xcopa_id,C1 or C2? premise_idmt,accuracy,0.52 +xcopa_id,best_option_idmt,accuracy,0.73 +xcopa_id,cause_effect_idmt,accuracy,0.82 +xcopa_id,i_am_hesitating_idmt,accuracy,0.76 +xcopa_id,plausible_alternatives_idmt,accuracy,0.78 +xcopa_id,median,accuracy,0.76 +xcopa_sw,C1 or C2? premise_swmt,accuracy,0.59 +xcopa_sw,best_option_swmt,accuracy,0.62 +xcopa_sw,cause_effect_swmt,accuracy,0.64 +xcopa_sw,i_am_hesitating_swmt,accuracy,0.63 +xcopa_sw,plausible_alternatives_swmt,accuracy,0.64 +xcopa_sw,median,accuracy,0.63 +xcopa_ta,C1 or C2? premise_tamt,accuracy,0.6 +xcopa_ta,best_option_tamt,accuracy,0.47 +xcopa_ta,cause_effect_tamt,accuracy,0.62 +xcopa_ta,i_am_hesitating_tamt,accuracy,0.64 +xcopa_ta,plausible_alternatives_tamt,accuracy,0.63 +xcopa_ta,median,accuracy,0.62 +xcopa_vi,C1 or C2? premise_vimt,accuracy,0.56 +xcopa_vi,best_option_vimt,accuracy,0.77 +xcopa_vi,cause_effect_vimt,accuracy,0.84 +xcopa_vi,i_am_hesitating_vimt,accuracy,0.82 +xcopa_vi,plausible_alternatives_vimt,accuracy,0.84 +xcopa_vi,median,accuracy,0.82 +xcopa_zh,C1 or C2? premise_zhmt,accuracy,0.62 +xcopa_zh,best_option_zhmt,accuracy,0.72 +xcopa_zh,cause_effect_zhmt,accuracy,0.89 +xcopa_zh,i_am_hesitating_zhmt,accuracy,0.9 +xcopa_zh,plausible_alternatives_zhmt,accuracy,0.86 +xcopa_zh,median,accuracy,0.86 +xstory_cloze_ar,Answer Given options_armt,accuracy,0.8669755129053607 +xstory_cloze_ar,Choose Story Ending_armt,accuracy,0.927862342819325 +xstory_cloze_ar,Generate Ending_armt,accuracy,0.6479152878888154 +xstory_cloze_ar,Novel Correct Ending_armt,accuracy,0.9185969556585043 +xstory_cloze_ar,Story Continuation and Options_armt,accuracy,0.9113170086035738 +xstory_cloze_ar,median,accuracy,0.9113170086035738 +xstory_cloze_es,Answer Given options_esmt,accuracy,0.9272005294506949 +xstory_cloze_es,Choose Story Ending_esmt,accuracy,0.9437458636664461 +xstory_cloze_es,Generate Ending_esmt,accuracy,0.7445400397088021 +xstory_cloze_es,Novel Correct Ending_esmt,accuracy,0.9397749834546658 +xstory_cloze_es,Story Continuation and Options_esmt,accuracy,0.9298477829252151 +xstory_cloze_es,median,accuracy,0.9298477829252151 +xstory_cloze_eu,Answer Given options_eumt,accuracy,0.7452018530774321 +xstory_cloze_eu,Choose Story Ending_eumt,accuracy,0.8676373262739907 +xstory_cloze_eu,Generate Ending_eumt,accuracy,0.6082064857710126 +xstory_cloze_eu,Novel Correct Ending_eumt,accuracy,0.8219722038385175 +xstory_cloze_eu,Story Continuation and Options_eumt,accuracy,0.814030443414957 +xstory_cloze_eu,median,accuracy,0.814030443414957 +xstory_cloze_hi,Answer Given options_himt,accuracy,0.8266048974189278 +xstory_cloze_hi,Choose Story Ending_himt,accuracy,0.8841826604897419 +xstory_cloze_hi,Generate Ending_himt,accuracy,0.657180675049636 +xstory_cloze_hi,Novel Correct Ending_himt,accuracy,0.8669755129053607 +xstory_cloze_hi,Story Continuation and Options_himt,accuracy,0.8689609530112509 +xstory_cloze_hi,median,accuracy,0.8669755129053607 +xstory_cloze_id,Answer Given options_idmt,accuracy,0.8616810059563204 +xstory_cloze_id,Choose Story Ending_idmt,accuracy,0.914626075446724 +xstory_cloze_id,Generate Ending_idmt,accuracy,0.6730641958967571 +xstory_cloze_id,Novel Correct Ending_idmt,accuracy,0.8954334877564527 +xstory_cloze_id,Story Continuation and Options_idmt,accuracy,0.8927862342819325 +xstory_cloze_id,median,accuracy,0.8927862342819325 +xstory_cloze_zh,Answer Given options_zhmt,accuracy,0.9060225016545335 +xstory_cloze_zh,Choose Story Ending_zhmt,accuracy,0.9238914626075446 +xstory_cloze_zh,Generate Ending_zhmt,accuracy,0.686962276637988 +xstory_cloze_zh,Novel Correct Ending_zhmt,accuracy,0.9185969556585043 +xstory_cloze_zh,Story Continuation and Options_zhmt,accuracy,0.9192587690271343 +xstory_cloze_zh,median,accuracy,0.9185969556585043 +xwinograd_fr,Replace_frmt,accuracy,0.6506024096385542 +xwinograd_fr,True or False_frmt,accuracy,0.5662650602409639 +xwinograd_fr,does underscore refer to_frmt,accuracy,0.5542168674698795 +xwinograd_fr,stand for_frmt,accuracy,0.4819277108433735 +xwinograd_fr,underscore refer to_frmt,accuracy,0.6144578313253012 +xwinograd_fr,median,accuracy,0.5662650602409639 +xwinograd_pt,Replace_ptmt,accuracy,0.6425855513307985 +xwinograd_pt,True or False_ptmt,accuracy,0.49809885931558934 +xwinograd_pt,does underscore refer to_ptmt,accuracy,0.6045627376425855 +xwinograd_pt,stand for_ptmt,accuracy,0.5095057034220533 +xwinograd_pt,underscore refer to_ptmt,accuracy,0.6273764258555133 +xwinograd_pt,median,accuracy,0.6045627376425855 +xwinograd_zh,Replace_zhmt,accuracy,0.6845238095238095 +xwinograd_zh,True or False_zhmt,accuracy,0.503968253968254 +xwinograd_zh,does underscore refer to_zhmt,accuracy,0.626984126984127 +xwinograd_zh,stand for_zhmt,accuracy,0.503968253968254 +xwinograd_zh,underscore refer to_zhmt,accuracy,0.7023809523809523 +xwinograd_zh,median,accuracy,0.626984126984127 +multiple,average,multiple,0.7729547044755157 diff --git a/evaluation_xcopawinostorymt/merged.json b/evaluation_xcopawinostorymt/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..475ec7d320796d8b63ccd3ab6fb46dfd89c597f9 --- /dev/null +++ b/evaluation_xcopawinostorymt/merged.json @@ -0,0 +1 @@ +{"Muennighoff/xstory_cloze_ar": {"Answer Given options_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Answer Given options_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8669755129053607}, "template_name": "Answer Given options_armt"}, "Choose Story Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Choose Story Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.927862342819325}, "template_name": "Choose Story Ending_armt"}, "Generate Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Generate Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6479152878888154}, "template_name": "Generate Ending_armt"}, "Novel Correct Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Novel Correct Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9185969556585043}, "template_name": "Novel Correct Ending_armt"}, "Story Continuation and Options_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Story Continuation and Options_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9113170086035738}, "template_name": "Story Continuation and Options_armt"}}, "Muennighoff/xstory_cloze_es": {"Answer Given options_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Answer Given options_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9272005294506949}, "template_name": "Answer Given options_esmt"}, "Choose Story Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Choose Story Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9437458636664461}, "template_name": "Choose Story Ending_esmt"}, "Generate Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Generate Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7445400397088021}, "template_name": "Generate Ending_esmt"}, "Novel Correct Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Novel Correct Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9397749834546658}, "template_name": "Novel Correct Ending_esmt"}, "Story Continuation and Options_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Story Continuation and Options_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9298477829252151}, "template_name": "Story Continuation and Options_esmt"}}, "Muennighoff/xstory_cloze_eu": {"Answer Given options_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Answer Given options_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7452018530774321}, "template_name": "Answer Given options_eumt"}, "Choose Story Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Choose Story Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8676373262739907}, "template_name": "Choose Story Ending_eumt"}, "Generate Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Generate Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6082064857710126}, "template_name": "Generate Ending_eumt"}, "Novel Correct Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Novel Correct Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8219722038385175}, "template_name": "Novel Correct Ending_eumt"}, "Story Continuation and Options_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Story Continuation and Options_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.814030443414957}, "template_name": "Story Continuation and Options_eumt"}}, "Muennighoff/xstory_cloze_hi": {"Answer Given options_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Answer Given options_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8266048974189278}, "template_name": "Answer Given options_himt"}, "Choose Story Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Choose Story Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8841826604897419}, "template_name": "Choose Story Ending_himt"}, "Generate Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Generate Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.657180675049636}, "template_name": "Generate Ending_himt"}, "Novel Correct Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Novel Correct Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8669755129053607}, "template_name": "Novel Correct Ending_himt"}, "Story Continuation and Options_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Story Continuation and Options_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8689609530112509}, "template_name": "Story Continuation and Options_himt"}}, "Muennighoff/xstory_cloze_id": {"Answer Given options_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Answer Given options_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8616810059563204}, "template_name": "Answer Given options_idmt"}, "Choose Story Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Choose Story Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.914626075446724}, "template_name": "Choose Story Ending_idmt"}, "Generate Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Generate Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6730641958967571}, "template_name": "Generate Ending_idmt"}, "Novel Correct Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Novel Correct Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8954334877564527}, "template_name": "Novel Correct Ending_idmt"}, "Story Continuation and Options_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Story Continuation and Options_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8927862342819325}, "template_name": "Story Continuation and Options_idmt"}}, "Muennighoff/xstory_cloze_zh": {"Answer Given options_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9060225016545335}, "template_name": "Answer Given options_zhmt"}, "Choose Story Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9238914626075446}, "template_name": "Choose Story Ending_zhmt"}, "Generate Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.686962276637988}, "template_name": "Generate Ending_zhmt"}, "Novel Correct Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9185969556585043}, "template_name": "Novel Correct Ending_zhmt"}, "Story Continuation and Options_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.9192587690271343}, "template_name": "Story Continuation and Options_zhmt"}}, "Muennighoff/xwinograd_fr": {"Replace_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='Replace_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6506024096385542}, "template_name": "Replace_frmt"}, "True or False_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='True or False_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5662650602409639}, "template_name": "True or False_frmt"}, "does underscore refer to_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='does underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5542168674698795}, "template_name": "does underscore refer to_frmt"}, "stand for_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='stand for_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.4819277108433735}, "template_name": "stand for_frmt"}, "underscore refer to_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6144578313253012}, "template_name": "underscore refer to_frmt"}}, "Muennighoff/xwinograd_pt": {"Replace_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='Replace_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6425855513307985}, "template_name": "Replace_ptmt"}, "True or False_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='True or False_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.49809885931558934}, "template_name": "True or False_ptmt"}, "does underscore refer to_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='does underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6045627376425855}, "template_name": "does underscore refer to_ptmt"}, "stand for_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='stand for_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5095057034220533}, "template_name": "stand for_ptmt"}, "underscore refer to_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6273764258555133}, "template_name": "underscore refer to_ptmt"}}, "Muennighoff/xwinograd_zh": {"Replace_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6845238095238095}, "template_name": "Replace_zhmt"}, "True or False_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.503968253968254}, "template_name": "True or False_zhmt"}, "does underscore refer to_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='does underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.626984126984127}, "template_name": "does underscore refer to_zhmt"}, "stand for_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.503968253968254}, "template_name": "stand for_zhmt"}, "underscore refer to_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.7023809523809523}, "template_name": "underscore refer to_zhmt"}}, "xcopa_id": {"C1 or C2? premise_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='C1 or C2? premise_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.52}, "template_name": "C1 or C2? premise_idmt"}, "best_option_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='best_option_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.73}, "template_name": "best_option_idmt"}, "cause_effect_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='cause_effect_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.82}, "template_name": "cause_effect_idmt"}, "i_am_hesitating_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='i_am_hesitating_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.76}, "template_name": "i_am_hesitating_idmt"}, "plausible_alternatives_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='plausible_alternatives_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.78}, "template_name": "plausible_alternatives_idmt"}}, "xcopa_sw": {"C1 or C2? premise_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='C1 or C2? premise_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.59}, "template_name": "C1 or C2? premise_swmt"}, "best_option_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='best_option_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "best_option_swmt"}, "cause_effect_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='cause_effect_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "cause_effect_swmt"}, "i_am_hesitating_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='i_am_hesitating_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.63}, "template_name": "i_am_hesitating_swmt"}, "plausible_alternatives_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='plausible_alternatives_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "plausible_alternatives_swmt"}}, "xcopa_ta": {"C1 or C2? premise_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='C1 or C2? premise_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.6}, "template_name": "C1 or C2? premise_tamt"}, "best_option_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='best_option_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.47}, "template_name": "best_option_tamt"}, "cause_effect_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='cause_effect_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "cause_effect_tamt"}, "i_am_hesitating_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='i_am_hesitating_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "i_am_hesitating_tamt"}, "plausible_alternatives_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='plausible_alternatives_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.63}, "template_name": "plausible_alternatives_tamt"}}, "xcopa_vi": {"C1 or C2? premise_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='C1 or C2? premise_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.56}, "template_name": "C1 or C2? premise_vimt"}, "best_option_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='best_option_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.77}, "template_name": "best_option_vimt"}, "cause_effect_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='cause_effect_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.84}, "template_name": "cause_effect_vimt"}, "i_am_hesitating_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='i_am_hesitating_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.82}, "template_name": "i_am_hesitating_vimt"}, "plausible_alternatives_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='plausible_alternatives_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.84}, "template_name": "plausible_alternatives_vimt"}}, "xcopa_zh": {"C1 or C2? premise_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "C1 or C2? premise_zhmt"}, "best_option_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.72}, "template_name": "best_option_zhmt"}, "cause_effect_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='cause_effect_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.89}, "template_name": "cause_effect_zhmt"}, "i_am_hesitating_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.9}, "template_name": "i_am_hesitating_zhmt"}, "plausible_alternatives_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.86}, "template_name": "plausible_alternatives_zhmt"}}} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/id/C1_or_C2?_premise_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/C1_or_C2?_premise_idmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..2d18fed5573d9e7431c3cd9487581f8c564ab578 --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/id/C1_or_C2?_premise_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "C1 or C2? premise_idmt", + "evaluation": { + "accuracy": 0.52 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='C1 or C2? premise_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/id/best_option_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/best_option_idmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..f233ae5c456777aac5b1417d10ffd542c87ab48a --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/id/best_option_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "best_option_idmt", + "evaluation": { + "accuracy": 0.73 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='best_option_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/id/cause_effect_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/cause_effect_idmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..f3ddeb503bac6e9b74ab9f16ac0bd697979bfeaa --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/id/cause_effect_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "cause_effect_idmt", + "evaluation": { + "accuracy": 0.82 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='cause_effect_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/id/i_am_hesitating_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/i_am_hesitating_idmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..f6152a3d3f369ace4c26504a34ebf357c7b8bace --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/id/i_am_hesitating_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "i_am_hesitating_idmt", + "evaluation": { + "accuracy": 0.76 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='i_am_hesitating_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/id/plausible_alternatives_idmt/results.json b/evaluation_xcopawinostorymt/xcopa/id/plausible_alternatives_idmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..725e426182a3238175ceaa5ab7cbab0b1e7f8111 --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/id/plausible_alternatives_idmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "id", + "template_name": "plausible_alternatives_idmt", + "evaluation": { + "accuracy": 0.78 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='plausible_alternatives_idmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/sw/C1_or_C2?_premise_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/C1_or_C2?_premise_swmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..2c7a00d76a6333bc34aa6601d2e68015df48f449 --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/sw/C1_or_C2?_premise_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "C1 or C2? premise_swmt", + "evaluation": { + "accuracy": 0.59 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='C1 or C2? premise_swmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/sw/best_option_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/best_option_swmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..173ef6de7e74ff2b193350e7ba10a9baac445972 --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/sw/best_option_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "best_option_swmt", + "evaluation": { + "accuracy": 0.62 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='best_option_swmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/sw/cause_effect_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/cause_effect_swmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..1dafce6a4ea8d20602aa5dca2eccd5084fdd8fae --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/sw/cause_effect_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "cause_effect_swmt", + "evaluation": { + "accuracy": 0.64 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='cause_effect_swmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/sw/i_am_hesitating_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/i_am_hesitating_swmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..2f87f621ae05e69755b9c99d53b581ca5404b6c6 --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/sw/i_am_hesitating_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "i_am_hesitating_swmt", + "evaluation": { + "accuracy": 0.63 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='i_am_hesitating_swmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/sw/plausible_alternatives_swmt/results.json b/evaluation_xcopawinostorymt/xcopa/sw/plausible_alternatives_swmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..4e920992048a1dd677f8efe583fd6028eda0eb81 --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/sw/plausible_alternatives_swmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "sw", + "template_name": "plausible_alternatives_swmt", + "evaluation": { + "accuracy": 0.64 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='plausible_alternatives_swmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/ta/C1_or_C2?_premise_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/C1_or_C2?_premise_tamt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..36cc67d63cc026f3fd5618fed9096f2db12a236f --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/ta/C1_or_C2?_premise_tamt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ta", + "template_name": "C1 or C2? premise_tamt", + "evaluation": { + "accuracy": 0.6 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='C1 or C2? premise_tamt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/ta/best_option_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/best_option_tamt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..7729bbe69f7d2120564bd7f292583d853483e788 --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/ta/best_option_tamt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ta", + "template_name": "best_option_tamt", + "evaluation": { + "accuracy": 0.47 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='best_option_tamt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/ta/cause_effect_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/cause_effect_tamt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..d460b77dca312d4b564a3742cfe47ae5fd764aa0 --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/ta/cause_effect_tamt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ta", + "template_name": "cause_effect_tamt", + "evaluation": { + "accuracy": 0.62 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='cause_effect_tamt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/ta/i_am_hesitating_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/i_am_hesitating_tamt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..9c65367a97ed502719e5a6416c0976e4d79c375e --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/ta/i_am_hesitating_tamt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ta", + "template_name": "i_am_hesitating_tamt", + "evaluation": { + "accuracy": 0.64 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='i_am_hesitating_tamt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/ta/plausible_alternatives_tamt/results.json b/evaluation_xcopawinostorymt/xcopa/ta/plausible_alternatives_tamt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..626d419e0b61a3931438550cfc2584491f93705a --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/ta/plausible_alternatives_tamt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "ta", + "template_name": "plausible_alternatives_tamt", + "evaluation": { + "accuracy": 0.63 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='plausible_alternatives_tamt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/vi/C1_or_C2?_premise_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/C1_or_C2?_premise_vimt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..0798cf5718e3c2fa96370497cecbc51ceafb6023 --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/vi/C1_or_C2?_premise_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "vi", + "template_name": "C1 or C2? premise_vimt", + "evaluation": { + "accuracy": 0.56 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='C1 or C2? premise_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/vi/best_option_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/best_option_vimt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..51c5211271e532115de9f6764b8efda253123fa4 --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/vi/best_option_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "vi", + "template_name": "best_option_vimt", + "evaluation": { + "accuracy": 0.77 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='best_option_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/vi/cause_effect_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/cause_effect_vimt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..b5d213a526385f21ab89b0b3ce9bb37d56d50396 --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/vi/cause_effect_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "vi", + "template_name": "cause_effect_vimt", + "evaluation": { + "accuracy": 0.84 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='cause_effect_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/vi/i_am_hesitating_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/i_am_hesitating_vimt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..e9f4e840960efac6d65c98d80f1590f42689fe9d --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/vi/i_am_hesitating_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "vi", + "template_name": "i_am_hesitating_vimt", + "evaluation": { + "accuracy": 0.82 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='i_am_hesitating_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/vi/plausible_alternatives_vimt/results.json b/evaluation_xcopawinostorymt/xcopa/vi/plausible_alternatives_vimt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..652ee68654626dd8e0e6328ad034d49da9c7dec0 --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/vi/plausible_alternatives_vimt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "vi", + "template_name": "plausible_alternatives_vimt", + "evaluation": { + "accuracy": 0.84 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='plausible_alternatives_vimt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..265db38422ff85e5df5a9f3e67b378c378ba07a4 --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "C1 or C2? premise_zhmt", + "evaluation": { + "accuracy": 0.62 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/zh/best_option_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/best_option_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..c887673cfe5637380b482de72f660776fbeb4557 --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/zh/best_option_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "best_option_zhmt", + "evaluation": { + "accuracy": 0.72 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/zh/cause_effect_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/cause_effect_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..5d9ef41457058926e54c6b244dcfff0e81e8bb6f --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/zh/cause_effect_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "cause_effect_zhmt", + "evaluation": { + "accuracy": 0.89 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='cause_effect_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/zh/i_am_hesitating_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/i_am_hesitating_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..63a9404e70cf90d3f6a89bb6794fb478e372cdda --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/zh/i_am_hesitating_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "i_am_hesitating_zhmt", + "evaluation": { + "accuracy": 0.9 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file diff --git a/evaluation_xcopawinostorymt/xcopa/zh/plausible_alternatives_zhmt/results.json b/evaluation_xcopawinostorymt/xcopa/zh/plausible_alternatives_zhmt/results.json new file mode 100644 index 0000000000000000000000000000000000000000..953e0df3d4da3f6375f232d68a6bf67b73cef1bc --- /dev/null +++ b/evaluation_xcopawinostorymt/xcopa/zh/plausible_alternatives_zhmt/results.json @@ -0,0 +1,9 @@ +{ + "dataset_name": "xcopa", + "dataset_config_name": "zh", + "template_name": "plausible_alternatives_zhmt", + "evaluation": { + "accuracy": 0.86 + }, + "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='bfloat16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/176bt0/bloomz/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhmt', tokenizer_name=None, use_slow_tokenizer=False)" +} \ No newline at end of file