diff --git a/.gitattributes b/.gitattributes index 9c24676beb68851799956f374928f1ad551c37b3..55efb38eafb56f2dd0719200f81894ef8245422e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -177,3 +177,51 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text 8b7178b178b/evaluation/generation/examples.8b7178b178b_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text 8b7178b178b/evaluation/generation/examples.8b7178b178b_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text 8b7178b178b/evaluation/generation/examples.8b7178b178b_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text +8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_0.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..861f974eeac161f474a44020aa41ed3d3d4a9b06 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4373157998206226, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.056741486001354216}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07630675677312203, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021365872076980082}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.2947830429134024, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0047665492114514684}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.10947380559420297, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022511546576554292}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03437396040114199, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011646295650624351}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.13907462394894776, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0031715068123442996}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.051001464150726095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014146940985046523}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07227253003813615, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001955466598501427}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.28336049631237586, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004569573466829386}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10417993615217146, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002043414019393663}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07290032072974349, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002034641349821296}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2825727583996548, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004541613699421234}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10462419732455051, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002115527416565905}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_1.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a9ec0a57b8354b510ead04d5de356e87e3cadb7e --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5816764130686213, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.054147923323820346}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07783086970109386, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001926390801136041}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.36711372145643695, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005207087210896133}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11727270364668571, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021476260743326565}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.035941971570071736, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011116447277521812}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.17604536516651662, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0036353372642678756}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05455324665630359, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013665856855669073}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07209000868611247, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017559014471748775}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.33901675935925657, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004687996190515175}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10871994085882164, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019153699814912425}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0735230628158407, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018435905984185187}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.34433610068323095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004792623835429767}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11053867463797286, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020173857832705686}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_2.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6fb91afeb3ce1148a225b5256529d72363b8c813 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6801735381749798, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04514863078336761}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07666591908388082, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015425382902699712}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.40028519820196484, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0053218199498182555}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11972585427910692, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001963983917319172}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03562073396147935, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009954366405729978}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.19724892000097954, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004019172908229666}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.055583861631680276, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012687258801338563}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07037367893965144, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013558489579657599}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3648716666842121, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004723267009132289}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10999138993564177, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017648420950003858}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07198504559682002, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014532243063151847}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3737640227903558, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004892612299725926}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1122905686653009, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018425170251459826}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_3.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..13ff18c86cb511025348204831e48f067d81f83b --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.8393015493701177, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.049868510474836}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07934496701755703, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017108996646152186}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.41957311539245606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005383968460349908}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.12332290364001403, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020213255478620834}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03700401446806371, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010754080423489075}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.20785188427731804, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0039850136524860235}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05769919253564089, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001315453385968732}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07186075331384528, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015065682717614625}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3792930717112658, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004756348746075166}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11171273283361995, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017620828691461753}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07439776049493438, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016078356678010679}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.39147213524665503, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004968164479358761}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11543602475589927, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001888123216509075}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_4.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1263e84c5629dfdbe72dbe25643856f57aca8aa4 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.8436391155168642, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.049789928911136136}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.08188000384427346, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015429926759057174}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.45225212783083296, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005452056426418837}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.12951808355992003, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019403013558157668}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03789712819393572, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009152136077591718}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.22689495723762804, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004067188852506108}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.06050302495157815, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012369724150331927}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07330729359090057, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013644419423480823}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.4045943171397325, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0048320778663538955}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11592286526905457, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016906240203803347}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07637812548771711, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014540942217052375}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.4207633762253156, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.005005614020197891}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.12060522562383363, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018022149452879743}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_5.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..afbf69a82bd99bc9a43561f1680507437984b2dd --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.968801645723095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05462868115593731}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.08408541082259362, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001415159971957011}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.4730441639084742, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005431978662817184}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.13468892691608006, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019240731337134673}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.038803485654806714, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008788058278028121}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.2379631167113304, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004152278672512853}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.06252454636994373, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012377179103821956}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07423644311431606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012142160071926726}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.4194644730807224, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0047799777603664715}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11896213494013362, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001650634135835398}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07800322392434518, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013202039315876929}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.43851085762444947, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004968418620592047}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.12479160570822044, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017844920178026805}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_0.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..242fae1979f78c855ef9ddf32c98ab336fd59987 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.1628352267958049, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019076289743005436}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.28137034403026634, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002726139405286867}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.19188833167040015, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018702943944022418}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.035168122195033014, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008051014260490183}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06371619685684557, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015314676134498058}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04191366659253858, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000917658619128283}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.12246239102421737, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001285311648356421}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.2201424968967575, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022006617198564397}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.14614932555210797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012956973040530264}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.14960133985735732, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017445585328390875}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2600296050390341, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002552101400202146}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.17662543049894286, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001716560474974557}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.8310798418084737, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06119305662626742}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_1.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..89f32caa36fd8c3ee75bcc6e3a81de12a627cee0 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.17507251962841255, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019979193178610104}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.3008600636001003, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002760929751246215}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.20607218590152535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019348192676270253}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.040814840621502924, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008769075552676463}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07239534384858555, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015745819458817655}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04827166271383884, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000981638580890638}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.12604513290190442, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001331288983523961}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.22410236240350495, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021252698749924166}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.14993882097099004, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013143528095662098}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.1634506211607229, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018573519091366366}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.282240921650429, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002619133620350951}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.19267274247494454, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018042087124662393}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.463691563435929, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.056238669871763534}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_0.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..07245b00c863e58761a4c09ac494c1fcc144d843 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 0.3016960813220789, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04082243698870125}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.0628362517057776, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006647851313912593}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.137799699754724, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0013399199497499206}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.08446489060239885, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000832880994211257}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.009897778438587298, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002813167869752497}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.02030644827526888, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005626168002295672}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.013098749477153882, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00036393407871382073}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.060894085509621707, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006165662598640652}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.1334600824136164, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0012622226683087598}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.08185345358329085, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007758077715250189}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.05822489106803366, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00059732648786741}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.12759747263850774, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011920315451119745}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.07821933484300712, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000742258462877918}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_1.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c523db860077e831e24df239bb0a685733f7ea30 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 9.014911506469076, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10639404635916568}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.4220547738164344, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029747898853502135}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.44890682882262095, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030482256607117985}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4071860181343587, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002172401493926826}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.18398717679998686, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002070314538395242}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.19695708046780447, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021253468809232486}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.17675735102863532, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016983524897307275}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.301979337271323, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002426170411020153}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.32120889976820555, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024556761259021593}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2901955244768342, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017445755551563752}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.3498631569304881, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00275693533807641}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3704033862486897, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00277359280896307}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3366538075182426, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020840116286440125}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_2.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..15db004d4aa99c7897c1c28387b487529c853355 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 11.1355912496663, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13104161087327487}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.45419518183344554, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027266345121146297}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.48044140772752547, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002817950150203246}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4449026725503146, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020303864410380614}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2124884501715128, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0020421137596173576}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.22663601203825093, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021799430214164035}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.20791813280930055, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017732074653438918}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.32873823752244813, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002262430136729236}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.34880236417169236, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024064753259342344}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.32200623778436555, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017788348099423016}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.38047010773438, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002559842275120574}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.40224234667337105, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002660545444016772}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.37259609212766276, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020305961255707534}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_3.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..724dce62f172eb6f393ec13619b4906e06a285de --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 12.10470203084112, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.160895558301053}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.45456971607102353, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023960763531178423}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.47953391372338766, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027865520920584822}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4492382386896765, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019699301602619833}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2163784140584202, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001980036007401307}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.23028758666497792, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022756293438486903}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.2138339925008178, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018445417129634963}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.33426681511660167, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002069523940604892}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.35354783483337526, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024228991972543336}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.3303919839118561, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001783388701966232}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.3827360863233357, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002359678766192712}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.40399025031746394, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002710450542079631}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.378239342272828, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002043462221499771}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_4.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b59237d0c0d384891ba88f6b11a9a7f8fd0f04b6 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 12.44085622955195, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13067375364343567}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.45479182860495637, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023107278825576553}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.48017725960351876, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027316537664281945}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4511875093908521, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001948885960105966}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.21833473257724795, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019502917664114363}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.23333710855434647, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002269432186215575}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.21707781052843164, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00185164014349205}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.3366370287941819, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019965287497770777}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.35684028344910274, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024057773390550254}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.33434874026181494, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017786642192651214}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.38571523251054607, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022865314323742805}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.40803304829345277, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027038368009947755}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.38297835064075053, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020438066934495387}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_5.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4d6a9efd57b9a41d9dc660181d857bf82ad1b6de --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 12.352678257477072, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1229858870702429}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.4536679355694742, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002282805690194143}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.48257839264210245, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002694452362144022}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4520495564262539, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019091091728356577}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.2184937204533077, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019084340007320115}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2348008779716612, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022303214323899125}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.21807002700335165, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018243298667958761}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.33692930278278704, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019923203298548866}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.36002100624003847, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024053149539646364}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.336313193439231, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001791714678974818}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.38599958036488224, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002272531904720447}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.4113378384062774, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002667546056299814}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.38490335194628533, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002018489316229454}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_0.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..86e22282cb6ec1ed3147295a424849271c434e16 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.15954668519186138, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002100617390256336}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.36309739763041776, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004368615445871372}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.2157374708631877, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002508783327476019}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03857312620535916, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012905906114606838}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.09177566784631346, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0029866334470645636}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.05279448511938101, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001664044690700437}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.11839547499241576, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001646949604960197}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.27198898577078495, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035222426961979128}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1604319788556621, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019556284658093183}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1250746587410693, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017464873888176696}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.28828909491111, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0038785736556512756}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1697976651408494, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002135233558113895}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.194933977793404, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09650169688197711}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_1.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5200f1c2007ab17b42d86e7f358388bd1fefedf2 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.130085376048313, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019273226219467348}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.31899443410148204, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004375864955378675}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.18261422772647087, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00258517997013667}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.027035766317831913, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010834329723279196}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06843246331973844, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002783755254166952}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.03829439148207006, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015268296315399182}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.09860861520075329, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014180838773530536}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2439616017337858, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003388978319546548}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.13872448607191418, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019214547149178605}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.10431817474650688, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015880489605249845}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.25778306821268704, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037812899736781526}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.14673205236069722, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002156691093638724}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.638275312642142, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11197811159644402}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_2.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..720e848d767777b2f3fc7e2e80a6f880ababd91e --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.14059986141007405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001980010991959053}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.34704590818514364, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004609783640663682}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.1978001315070318, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002669936318830053}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.033448694107769246, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011984145262489562}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08605872654054372, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003191665944600775}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04760922765448965, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001702755511678876}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10721276455620249, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014796729903083395}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.26727650609085896, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036995069904560843}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1511855886298728, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020261890172370246}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11346372550782761, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016303825401644061}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.28288787023938955, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004038388892533831}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.16004925294103853, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022364087298009884}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.0210914476360786, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12454216962651855}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_0.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..28ee79981c9d10e71ec4577328f5f963bb531873 --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75f8a7429d4fa990fc951a1eb232a50cdfe103ebc4f4333f2172fdf37f20c0ec +size 4145663 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_1.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7db35f3405799b3d311fba2d32c06a95d0e47c95 --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8198ef33953196c4beed6a0eb43940e14ec631b3599f985f4f948213a397bf5 +size 5175400 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_2.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9336634c030c5b6df5dcfe9c7f117ebc50c99130 --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede4ec95ed7dbcd239fc5145e4c183de6edf7c4730fa59b8a15767964b2898e1 +size 6090106 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_3.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b8ebfc342b7b5919d66537186910274781ef7d14 --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b886146a6a2b25a0004d71a5ffcf0ded8cfd777be395c92cdc43eda4e11e84e3 +size 7018891 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_4.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6986e2ce6cf3c2c77439c7ec12876ab5649c658c --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5460e9e85d6abd142df975a4be1d4d38d3ea658d547750d1be20ba6f8260897e +size 7906446 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_5.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b19414c4aff977e52ec5f6b5c38b2a7e883e501f --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92361c08ce0393304401ae18656799a36fd64cdd095286e241874604253293e6 +size 8802413 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_0.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d19a9d757dcff21896e58dd348897b2f277605c3 --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7812e51bc8a7484b2e16a2713ff42a9f1901d1a61db775f40ee84e41fd3a604f +size 7699465 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_1.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..830dd7b14071defe6218e74f35c6f6e67e0449d4 --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cd79c8d0c5f8cbfbeb3c2542d489234b1c40a486e29cbc2cee637058b380710 +size 13315258 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ad47f52b5dffeda682b8f5dd0c897026272020b5 --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1abdb8dadd3b1fe934e1a1ccbbfa0ea4ca5656b2d16e5ff342efcbe5621eabd2 +size 4450074 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c5c7b732cefc6018fa7bdc3e3889542bf690bd2e --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:130dc53c9e5caf7253adc9ce05994f043dee9bfaa1d249fa814e50808dd0f69c +size 5179876 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..51ae2a19d189d2297dca383695ce543c08a720e5 --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:135e1879d04e18711c109c15cea363ed607a6ec9fbc1daf46dbe6ebd0a4f1e08 +size 6247622 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..75fa7d2c24ba91a8ee8956eba39e026129a052aa --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d2e2a0f6b09d08e84b83ae3548a2499c56aca470925d5ff3ff61b0ae00a0252 +size 7310505 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..242ebcba864264efc0ea681edddc8fbc4551db0d --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c1b15d54c73679ec97ca5f57bab183661dd20887d21bf8c9f8f1efd0abbadfd +size 8385150 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..617268f5cb0eb9fa5aa4f53020e03a69282ebbc7 --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:617ed30496c1be4389b080e6e664675b18bb04395264dad1e9e369db533f2a6c +size 9476118 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_0.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..42d882ccba59671e7ea3340953d5021b332ec98a --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0313d23cd7923b59e2c54d30e60aaf4ef2c75255c88c80bf65e42df6956e07bb +size 2823747 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_1.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5d436f6c828fbac946fcb3359d435771372567c3 --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4493325c3cc1255da093dcc419ad34b7385ad37f68b3849cf8562b4c6bc09b4 +size 5103294 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_2.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..689e1133b9dfd13a1abd631d3469b5c108cb56d5 --- /dev/null +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aa161eabcf632abf8a3a18136f127cde3ecf7519a9319ca940666fd5ab166aa +size 7378211 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_0.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0129df45eed2ec66dffc528ef27134b36390e7b5 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4373157998206226, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.056741486001354216 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07630675677312203, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0021365872076980082 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.2947830429134024, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0047665492114514684 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.10947380559420297, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022511546576554292 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03437396040114199, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011646295650624351 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.13907462394894776, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0031715068123442996 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.051001464150726095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0014146940985046523 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07227253003813615, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001955466598501427 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.28336049631237586, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004569573466829386 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10417993615217146, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002043414019393663 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07290032072974349, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002034641349821296 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2825727583996548, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004541613699421234 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10462419732455051, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002115527416565905 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_1.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..60c0a1fe4e2248c9cbe31379cb8b848b0cfa2728 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5816764130686213, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.054147923323820346 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07783086970109386, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001926390801136041 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.36711372145643695, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005207087210896133 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11727270364668571, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021476260743326565 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.035941971570071736, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011116447277521812 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.17604536516651662, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0036353372642678756 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05455324665630359, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013665856855669073 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07209000868611247, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017559014471748775 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.33901675935925657, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004687996190515175 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10871994085882164, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019153699814912425 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0735230628158407, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018435905984185187 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.34433610068323095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004792623835429767 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11053867463797286, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020173857832705686 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_2.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec40008415d462a3091339618a4cdb147740456 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6801735381749798, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04514863078336761 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07666591908388082, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015425382902699712 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.40028519820196484, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0053218199498182555 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11972585427910692, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001963983917319172 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03562073396147935, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009954366405729978 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.19724892000097954, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004019172908229666 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.055583861631680276, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012687258801338563 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07037367893965144, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013558489579657599 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3648716666842121, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004723267009132289 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10999138993564177, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017648420950003858 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07198504559682002, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014532243063151847 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3737640227903558, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004892612299725926 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1122905686653009, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018425170251459826 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_3.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9eaa106978424f88c613e6feab1990be07b1f1b5 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.8393015493701177, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.049868510474836 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07934496701755703, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017108996646152186 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.41957311539245606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005383968460349908 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12332290364001403, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020213255478620834 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03700401446806371, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010754080423489075 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.20785188427731804, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0039850136524860235 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05769919253564089, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001315453385968732 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07186075331384528, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015065682717614625 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3792930717112658, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004756348746075166 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11171273283361995, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017620828691461753 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07439776049493438, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016078356678010679 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.39147213524665503, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004968164479358761 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11543602475589927, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001888123216509075 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_4.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9ac8372d295e63a979c9b6d99b44524240a6e947 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.8436391155168642, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.049789928911136136 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08188000384427346, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015429926759057174 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.45225212783083296, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005452056426418837 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12951808355992003, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019403013558157668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03789712819393572, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009152136077591718 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.22689495723762804, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004067188852506108 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.06050302495157815, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012369724150331927 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07330729359090057, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013644419423480823 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.4045943171397325, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0048320778663538955 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11592286526905457, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016906240203803347 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07637812548771711, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014540942217052375 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.4207633762253156, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.005005614020197891 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.12060522562383363, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018022149452879743 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_5.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f306e46c88c1212f2e08c302c340b32458c0d91d --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.968801645723095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05462868115593731 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08408541082259362, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001415159971957011 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4730441639084742, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005431978662817184 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.13468892691608006, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019240731337134673 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.038803485654806714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008788058278028121 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.2379631167113304, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004152278672512853 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.06252454636994373, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012377179103821956 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07423644311431606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012142160071926726 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.4194644730807224, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0047799777603664715 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11896213494013362, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001650634135835398 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07800322392434518, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013202039315876929 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.43851085762444947, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004968418620592047 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.12479160570822044, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017844920178026805 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_0.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..de4131bae6090790e6a6e627c0de7516f37a263e --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.1628352267958049, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019076289743005436 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.28137034403026634, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002726139405286867 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.19188833167040015, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018702943944022418 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.035168122195033014, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008051014260490183 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06371619685684557, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015314676134498058 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04191366659253858, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000917658619128283 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.12246239102421737, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001285311648356421 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.2201424968967575, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022006617198564397 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14614932555210797, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012956973040530264 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.14960133985735732, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017445585328390875 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2600296050390341, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002552101400202146 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.17662543049894286, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001716560474974557 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.8310798418084737, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06119305662626742 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_1.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ccce1eb11a88ad2bf28961628ec2308884b54583 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.17507251962841255, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019979193178610104 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.3008600636001003, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002760929751246215 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.20607218590152535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019348192676270253 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.040814840621502924, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008769075552676463 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07239534384858555, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015745819458817655 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04827166271383884, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000981638580890638 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.12604513290190442, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001331288983523961 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.22410236240350495, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021252698749924166 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14993882097099004, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013143528095662098 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1634506211607229, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018573519091366366 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.282240921650429, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002619133620350951 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.19267274247494454, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018042087124662393 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.463691563435929, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.056238669871763534 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_0.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..830da6644a2d3a2788269bbf5d58afabb332d3eb --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.3016960813220789, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.04082243698870125 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.0628362517057776, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0006647851313912593 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.137799699754724, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0013399199497499206 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.08446489060239885, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.000832880994211257 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.009897778438587298, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0002813167869752497 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.02030644827526888, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0005626168002295672 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.013098749477153882, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00036393407871382073 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.060894085509621707, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0006165662598640652 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.1334600824136164, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0012622226683087598 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.08185345358329085, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0007758077715250189 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.05822489106803366, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00059732648786741 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.12759747263850774, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0011920315451119745 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.07821933484300712, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.000742258462877918 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_1.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b84ba9097759a6ef3c5dc348993a21c461992866 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 9.014911506469076, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10639404635916568 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.4220547738164344, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0029747898853502135 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.44890682882262095, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0030482256607117985 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4071860181343587, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002172401493926826 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.18398717679998686, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002070314538395242 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.19695708046780447, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021253468809232486 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.17675735102863532, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0016983524897307275 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.301979337271323, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002426170411020153 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.32120889976820555, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024556761259021593 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2901955244768342, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0017445755551563752 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.3498631569304881, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00275693533807641 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3704033862486897, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00277359280896307 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3366538075182426, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020840116286440125 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_2.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e6578da5b9343fac312eed7a233ec47886c5c1cd --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.1355912496663, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.13104161087327487 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.45419518183344554, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0027266345121146297 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.48044140772752547, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002817950150203246 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4449026725503146, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020303864410380614 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2124884501715128, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0020421137596173576 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.22663601203825093, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021799430214164035 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.20791813280930055, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0017732074653438918 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.32873823752244813, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002262430136729236 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34880236417169236, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024064753259342344 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.32200623778436555, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0017788348099423016 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.38047010773438, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002559842275120574 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.40224234667337105, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002660545444016772 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.37259609212766276, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020305961255707534 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_3.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ba94275d4b2c531c8c17c3b9f0486bb7315152fe --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 12.10470203084112, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.160895558301053 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.45456971607102353, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0023960763531178423 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47953391372338766, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027865520920584822 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4492382386896765, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019699301602619833 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2163784140584202, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001980036007401307 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23028758666497792, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022756293438486903 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2138339925008178, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0018445417129634963 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.33426681511660167, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002069523940604892 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35354783483337526, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024228991972543336 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3303919839118561, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001783388701966232 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.3827360863233357, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002359678766192712 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.40399025031746394, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002710450542079631 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.378239342272828, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002043462221499771 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_4.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..51501f24602cedefee090c66612d4638d4c5e4c2 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 12.44085622955195, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.13067375364343567 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.45479182860495637, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0023107278825576553 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.48017725960351876, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027316537664281945 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4511875093908521, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001948885960105966 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.21833473257724795, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0019502917664114363 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23333710855434647, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002269432186215575 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.21707781052843164, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00185164014349205 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3366370287941819, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0019965287497770777 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35684028344910274, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024057773390550254 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.33434874026181494, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0017786642192651214 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.38571523251054607, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0022865314323742805 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.40803304829345277, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027038368009947755 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.38297835064075053, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020438066934495387 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_5.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a7c0e38d7ee591381ab3343c7228b8fd8a4f3459 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 12.352678257477072, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1229858870702429 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.4536679355694742, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002282805690194143 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.48257839264210245, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002694452362144022 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4520495564262539, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019091091728356577 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2184937204533077, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0019084340007320115 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2348008779716612, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022303214323899125 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.21807002700335165, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0018243298667958761 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.33692930278278704, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0019923203298548866 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.36002100624003847, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024053149539646364 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.336313193439231, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001791714678974818 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.38599958036488224, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002272531904720447 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4113378384062774, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002667546056299814 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.38490335194628533, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002018489316229454 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_0.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..337f5f76a01eaa500d5328a3c0a0b8847ff98fc7 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.15954668519186138, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002100617390256336 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.36309739763041776, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004368615445871372 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2157374708631877, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002508783327476019 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03857312620535916, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012905906114606838 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.09177566784631346, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0029866334470645636 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05279448511938101, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001664044690700437 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.11839547499241576, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001646949604960197 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.27198898577078495, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035222426961979128 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1604319788556621, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019556284658093183 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1250746587410693, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017464873888176696 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.28828909491111, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038785736556512756 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1697976651408494, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002135233558113895 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.194933977793404, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09650169688197711 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_1.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f1d92b8f27f2c6b36ea3110ae86cca4714070d3d --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.130085376048313, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019273226219467348 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.31899443410148204, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004375864955378675 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.18261422772647087, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00258517997013667 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.027035766317831913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010834329723279196 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06843246331973844, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002783755254166952 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.03829439148207006, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015268296315399182 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.09860861520075329, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014180838773530536 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2439616017337858, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003388978319546548 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.13872448607191418, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019214547149178605 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.10431817474650688, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015880489605249845 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.25778306821268704, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037812899736781526 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.14673205236069722, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002156691093638724 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.638275312642142, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11197811159644402 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_2.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b522252ce9001ae8dcfaf4c8004be09865ca8bf7 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.14059986141007405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001980010991959053 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.34704590818514364, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004609783640663682 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.1978001315070318, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002669936318830053 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.033448694107769246, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011984145262489562 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08605872654054372, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.003191665944600775 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04760922765448965, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001702755511678876 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10721276455620249, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014796729903083395 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26727650609085896, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036995069904560843 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1511855886298728, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020261890172370246 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11346372550782761, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016303825401644061 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.28288787023938955, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004038388892533831 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16004925294103853, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022364087298009884 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.0210914476360786, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12454216962651855 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_0.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7d929e15d881f66702b0464d88325f7749217183 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.35, + "acc_stderr": 0.01509065034144423 + }, + "anli_r2": { + "acc": 0.346, + "acc_stderr": 0.015050266127564448 + }, + "anli_r3": { + "acc": 0.355, + "acc_stderr": 0.013819249004047296 + }, + "cb": { + "acc": 0.26785714285714285, + "acc_stderr": 0.05971290310957636, + "f1": 0.2374338624338624 + }, + "copa": { + "acc": 0.81, + "acc_stderr": 0.03942772444036623 + }, + "hellaswag": { + "acc": 0.5215096594303924, + "acc_stderr": 0.004985162074336112, + "acc_norm": 0.6843258315076678, + "acc_norm_stderr": 0.004638339207348913 + }, + "rte": { + "acc": 0.5270758122743683, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.5659037095501184, + "acc_stderr": 0.013929882555694054 + }, + "storycloze_2016": { + "acc": 0.7455905932656334, + "acc_stderr": 0.010071542492663043 + }, + "boolq": { + "acc": 0.6376146788990825, + "acc_stderr": 0.008407308655864048 + }, + "arc_easy": { + "acc": 0.6220538720538721, + "acc_stderr": 0.009949405744045469, + "acc_norm": 0.5395622895622896, + "acc_norm_stderr": 0.010227616386289017 + }, + "arc_challenge": { + "acc": 0.29180887372013653, + "acc_stderr": 0.013284525292403496, + "acc_norm": 0.3216723549488055, + "acc_norm_stderr": 0.013650488084494162 + }, + "sciq": { + "acc": 0.87, + "acc_stderr": 0.010640169792499361, + "acc_norm": 0.807, + "acc_norm_stderr": 0.012486268734370145 + }, + "piqa": { + "acc": 0.7627856365614799, + "acc_stderr": 0.009924694933586364, + "acc_norm": 0.7747551686615887, + "acc_norm_stderr": 0.00974664347103214 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_0_lm-eval_global_step84877_2023-05-15-10-07-19_0shots_backup.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_0_lm-eval_global_step84877_2023-05-15-10-07-19_0shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..7d929e15d881f66702b0464d88325f7749217183 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_0_lm-eval_global_step84877_2023-05-15-10-07-19_0shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.35, + "acc_stderr": 0.01509065034144423 + }, + "anli_r2": { + "acc": 0.346, + "acc_stderr": 0.015050266127564448 + }, + "anli_r3": { + "acc": 0.355, + "acc_stderr": 0.013819249004047296 + }, + "cb": { + "acc": 0.26785714285714285, + "acc_stderr": 0.05971290310957636, + "f1": 0.2374338624338624 + }, + "copa": { + "acc": 0.81, + "acc_stderr": 0.03942772444036623 + }, + "hellaswag": { + "acc": 0.5215096594303924, + "acc_stderr": 0.004985162074336112, + "acc_norm": 0.6843258315076678, + "acc_norm_stderr": 0.004638339207348913 + }, + "rte": { + "acc": 0.5270758122743683, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.5659037095501184, + "acc_stderr": 0.013929882555694054 + }, + "storycloze_2016": { + "acc": 0.7455905932656334, + "acc_stderr": 0.010071542492663043 + }, + "boolq": { + "acc": 0.6376146788990825, + "acc_stderr": 0.008407308655864048 + }, + "arc_easy": { + "acc": 0.6220538720538721, + "acc_stderr": 0.009949405744045469, + "acc_norm": 0.5395622895622896, + "acc_norm_stderr": 0.010227616386289017 + }, + "arc_challenge": { + "acc": 0.29180887372013653, + "acc_stderr": 0.013284525292403496, + "acc_norm": 0.3216723549488055, + "acc_norm_stderr": 0.013650488084494162 + }, + "sciq": { + "acc": 0.87, + "acc_stderr": 0.010640169792499361, + "acc_norm": 0.807, + "acc_norm_stderr": 0.012486268734370145 + }, + "piqa": { + "acc": 0.7627856365614799, + "acc_stderr": 0.009924694933586364, + "acc_norm": 0.7747551686615887, + "acc_norm_stderr": 0.00974664347103214 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_1.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0b95f97c2e1d35c12d87508778037bce6f4b9928 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.328, + "acc_stderr": 0.014853842487270334 + }, + "anli_r2": { + "acc": 0.316, + "acc_stderr": 0.01470919305605713 + }, + "anli_r3": { + "acc": 0.3591666666666667, + "acc_stderr": 0.013855141559780364 + }, + "cb": { + "acc": 0.26785714285714285, + "acc_stderr": 0.05971290310957635, + "f1": 0.21294539321104786 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.5201155148376817, + "acc_stderr": 0.004985741706385719, + "acc_norm": 0.6825333598884684, + "acc_norm_stderr": 0.004645393477680675 + }, + "rte": { + "acc": 0.4657039711191336, + "acc_stderr": 0.030025579819366426 + }, + "winogrande": { + "acc": 0.5872138910812944, + "acc_stderr": 0.013837060648682103 + }, + "storycloze_2016": { + "acc": 0.7365045430251203, + "acc_stderr": 0.010187168219156485 + }, + "boolq": { + "acc": 0.6477064220183486, + "acc_stderr": 0.00835476049390613 + }, + "arc_easy": { + "acc": 0.6422558922558923, + "acc_stderr": 0.009835772757343361, + "acc_norm": 0.6035353535353535, + "acc_norm_stderr": 0.010037412763064529 + }, + "arc_challenge": { + "acc": 0.3054607508532423, + "acc_stderr": 0.0134600804780025, + "acc_norm": 0.3319112627986348, + "acc_norm_stderr": 0.013760988200880538 + }, + "sciq": { + "acc": 0.905, + "acc_stderr": 0.009276910103103329, + "acc_norm": 0.88, + "acc_norm_stderr": 0.0102813280127474 + }, + "piqa": { + "acc": 0.764961915125136, + "acc_stderr": 0.009893146688805326, + "acc_norm": 0.7725788900979326, + "acc_norm_stderr": 0.009779850767847232 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_1_lm-eval_global_step84877_2023-05-15-10-07-19_1shots_backup.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_1_lm-eval_global_step84877_2023-05-15-10-07-19_1shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..0b95f97c2e1d35c12d87508778037bce6f4b9928 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_1_lm-eval_global_step84877_2023-05-15-10-07-19_1shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.328, + "acc_stderr": 0.014853842487270334 + }, + "anli_r2": { + "acc": 0.316, + "acc_stderr": 0.01470919305605713 + }, + "anli_r3": { + "acc": 0.3591666666666667, + "acc_stderr": 0.013855141559780364 + }, + "cb": { + "acc": 0.26785714285714285, + "acc_stderr": 0.05971290310957635, + "f1": 0.21294539321104786 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.5201155148376817, + "acc_stderr": 0.004985741706385719, + "acc_norm": 0.6825333598884684, + "acc_norm_stderr": 0.004645393477680675 + }, + "rte": { + "acc": 0.4657039711191336, + "acc_stderr": 0.030025579819366426 + }, + "winogrande": { + "acc": 0.5872138910812944, + "acc_stderr": 0.013837060648682103 + }, + "storycloze_2016": { + "acc": 0.7365045430251203, + "acc_stderr": 0.010187168219156485 + }, + "boolq": { + "acc": 0.6477064220183486, + "acc_stderr": 0.00835476049390613 + }, + "arc_easy": { + "acc": 0.6422558922558923, + "acc_stderr": 0.009835772757343361, + "acc_norm": 0.6035353535353535, + "acc_norm_stderr": 0.010037412763064529 + }, + "arc_challenge": { + "acc": 0.3054607508532423, + "acc_stderr": 0.0134600804780025, + "acc_norm": 0.3319112627986348, + "acc_norm_stderr": 0.013760988200880538 + }, + "sciq": { + "acc": 0.905, + "acc_stderr": 0.009276910103103329, + "acc_norm": 0.88, + "acc_norm_stderr": 0.0102813280127474 + }, + "piqa": { + "acc": 0.764961915125136, + "acc_stderr": 0.009893146688805326, + "acc_norm": 0.7725788900979326, + "acc_norm_stderr": 0.009779850767847232 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_2.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3f863650cfd91d3c3485af5f9aa82b9a4be2f723 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.336, + "acc_stderr": 0.014944140233795018 + }, + "anli_r2": { + "acc": 0.329, + "acc_stderr": 0.01486539538592835 + }, + "anli_r3": { + "acc": 0.35083333333333333, + "acc_stderr": 0.013782212417178195 + }, + "cb": { + "acc": 0.19642857142857142, + "acc_stderr": 0.05357142857142858, + "f1": 0.1668300653594771 + }, + "copa": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263 + }, + "hellaswag": { + "acc": 0.5200159330810595, + "acc_stderr": 0.004985781620467012, + "acc_norm": 0.6863174666401115, + "acc_norm_stderr": 0.004630407476835209 + }, + "rte": { + "acc": 0.48014440433212996, + "acc_stderr": 0.0300727231673172 + }, + "winogrande": { + "acc": 0.5864246250986582, + "acc_stderr": 0.013840971763195306 + }, + "storycloze_2016": { + "acc": 0.743452699091395, + "acc_stderr": 0.01009926092771917 + }, + "boolq": { + "acc": 0.6382262996941896, + "acc_stderr": 0.008404238796949254 + }, + "arc_easy": { + "acc": 0.6481481481481481, + "acc_stderr": 0.009799078929868706, + "acc_norm": 0.6212121212121212, + "acc_norm_stderr": 0.00995373765654204 + }, + "arc_challenge": { + "acc": 0.310580204778157, + "acc_stderr": 0.013522292098053059, + "acc_norm": 0.33532423208191126, + "acc_norm_stderr": 0.013796182947785562 + }, + "sciq": { + "acc": 0.913, + "acc_stderr": 0.00891686663074591, + "acc_norm": 0.889, + "acc_norm_stderr": 0.009938701010583726 + }, + "piqa": { + "acc": 0.7573449401523396, + "acc_stderr": 0.0100020025697087, + "acc_norm": 0.764961915125136, + "acc_norm_stderr": 0.009893146688805319 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_2_lm-eval_global_step84877_2023-05-15-10-07-19_2shots_backup.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_2_lm-eval_global_step84877_2023-05-15-10-07-19_2shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..3f863650cfd91d3c3485af5f9aa82b9a4be2f723 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_2_lm-eval_global_step84877_2023-05-15-10-07-19_2shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.336, + "acc_stderr": 0.014944140233795018 + }, + "anli_r2": { + "acc": 0.329, + "acc_stderr": 0.01486539538592835 + }, + "anli_r3": { + "acc": 0.35083333333333333, + "acc_stderr": 0.013782212417178195 + }, + "cb": { + "acc": 0.19642857142857142, + "acc_stderr": 0.05357142857142858, + "f1": 0.1668300653594771 + }, + "copa": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263 + }, + "hellaswag": { + "acc": 0.5200159330810595, + "acc_stderr": 0.004985781620467012, + "acc_norm": 0.6863174666401115, + "acc_norm_stderr": 0.004630407476835209 + }, + "rte": { + "acc": 0.48014440433212996, + "acc_stderr": 0.0300727231673172 + }, + "winogrande": { + "acc": 0.5864246250986582, + "acc_stderr": 0.013840971763195306 + }, + "storycloze_2016": { + "acc": 0.743452699091395, + "acc_stderr": 0.01009926092771917 + }, + "boolq": { + "acc": 0.6382262996941896, + "acc_stderr": 0.008404238796949254 + }, + "arc_easy": { + "acc": 0.6481481481481481, + "acc_stderr": 0.009799078929868706, + "acc_norm": 0.6212121212121212, + "acc_norm_stderr": 0.00995373765654204 + }, + "arc_challenge": { + "acc": 0.310580204778157, + "acc_stderr": 0.013522292098053059, + "acc_norm": 0.33532423208191126, + "acc_norm_stderr": 0.013796182947785562 + }, + "sciq": { + "acc": 0.913, + "acc_stderr": 0.00891686663074591, + "acc_norm": 0.889, + "acc_norm_stderr": 0.009938701010583726 + }, + "piqa": { + "acc": 0.7573449401523396, + "acc_stderr": 0.0100020025697087, + "acc_norm": 0.764961915125136, + "acc_norm_stderr": 0.009893146688805319 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_3.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0f847e0ac91d717a70d667d7d481f9d4c0fc3be6 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.348, + "acc_stderr": 0.01507060460376841 + }, + "anli_r2": { + "acc": 0.344, + "acc_stderr": 0.015029633724408947 + }, + "anli_r3": { + "acc": 0.3325, + "acc_stderr": 0.013605417345710528 + }, + "cb": { + "acc": 0.3392857142857143, + "acc_stderr": 0.06384226561930825, + "f1": 0.3185837135128588 + }, + "copa": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263 + }, + "hellaswag": { + "acc": 0.5206134236207927, + "acc_stderr": 0.00498553915978342, + "acc_norm": 0.6902011551483768, + "acc_norm_stderr": 0.00461465517501001 + }, + "rte": { + "acc": 0.48014440433212996, + "acc_stderr": 0.0300727231673172 + }, + "winogrande": { + "acc": 0.601420678768745, + "acc_stderr": 0.01376035717687383 + }, + "storycloze_2016": { + "acc": 0.7504008551576697, + "acc_stderr": 0.010008002459430848 + }, + "boolq": { + "acc": 0.6363914373088685, + "acc_stderr": 0.008413404209789989 + }, + "arc_easy": { + "acc": 0.648989898989899, + "acc_stderr": 0.009793703885101042, + "acc_norm": 0.6393097643097643, + "acc_norm_stderr": 0.009853512108416748 + }, + "arc_challenge": { + "acc": 0.3148464163822526, + "acc_stderr": 0.01357265770308495, + "acc_norm": 0.3250853242320819, + "acc_norm_stderr": 0.013688147309729119 + }, + "sciq": { + "acc": 0.917, + "acc_stderr": 0.008728527206074792, + "acc_norm": 0.911, + "acc_norm_stderr": 0.009008893392651521 + }, + "piqa": { + "acc": 0.7584330794341676, + "acc_stderr": 0.009986718001804467, + "acc_norm": 0.7665941240478781, + "acc_norm_stderr": 0.009869247889520991 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_3_lm-eval_global_step84877_2023-05-15-10-07-25_3shots_backup.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_3_lm-eval_global_step84877_2023-05-15-10-07-25_3shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..0f847e0ac91d717a70d667d7d481f9d4c0fc3be6 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_3_lm-eval_global_step84877_2023-05-15-10-07-25_3shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.348, + "acc_stderr": 0.01507060460376841 + }, + "anli_r2": { + "acc": 0.344, + "acc_stderr": 0.015029633724408947 + }, + "anli_r3": { + "acc": 0.3325, + "acc_stderr": 0.013605417345710528 + }, + "cb": { + "acc": 0.3392857142857143, + "acc_stderr": 0.06384226561930825, + "f1": 0.3185837135128588 + }, + "copa": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263 + }, + "hellaswag": { + "acc": 0.5206134236207927, + "acc_stderr": 0.00498553915978342, + "acc_norm": 0.6902011551483768, + "acc_norm_stderr": 0.00461465517501001 + }, + "rte": { + "acc": 0.48014440433212996, + "acc_stderr": 0.0300727231673172 + }, + "winogrande": { + "acc": 0.601420678768745, + "acc_stderr": 0.01376035717687383 + }, + "storycloze_2016": { + "acc": 0.7504008551576697, + "acc_stderr": 0.010008002459430848 + }, + "boolq": { + "acc": 0.6363914373088685, + "acc_stderr": 0.008413404209789989 + }, + "arc_easy": { + "acc": 0.648989898989899, + "acc_stderr": 0.009793703885101042, + "acc_norm": 0.6393097643097643, + "acc_norm_stderr": 0.009853512108416748 + }, + "arc_challenge": { + "acc": 0.3148464163822526, + "acc_stderr": 0.01357265770308495, + "acc_norm": 0.3250853242320819, + "acc_norm_stderr": 0.013688147309729119 + }, + "sciq": { + "acc": 0.917, + "acc_stderr": 0.008728527206074792, + "acc_norm": 0.911, + "acc_norm_stderr": 0.009008893392651521 + }, + "piqa": { + "acc": 0.7584330794341676, + "acc_stderr": 0.009986718001804467, + "acc_norm": 0.7665941240478781, + "acc_norm_stderr": 0.009869247889520991 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_4.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5d45f389f5a7ddf986a1b66641193fa784359241 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.344, + "acc_stderr": 0.015029633724408947 + }, + "anli_r2": { + "acc": 0.345, + "acc_stderr": 0.015039986742055238 + }, + "anli_r3": { + "acc": 0.34833333333333333, + "acc_stderr": 0.013759437498874072 + }, + "cb": { + "acc": 0.21428571428571427, + "acc_stderr": 0.055328333517248834, + "f1": 0.1997113997113997 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.5206134236207927, + "acc_stderr": 0.004985539159783419, + "acc_norm": 0.6900019916351324, + "acc_norm_stderr": 0.004615472210316043 + }, + "rte": { + "acc": 0.48736462093862815, + "acc_stderr": 0.030086851767188564 + }, + "winogrande": { + "acc": 0.6077348066298343, + "acc_stderr": 0.013722400462000888 + }, + "storycloze_2016": { + "acc": 0.7509353287012293, + "acc_stderr": 0.010000841162740146 + }, + "boolq": { + "acc": 0.636085626911315, + "acc_stderr": 0.00841491890912884 + }, + "arc_easy": { + "acc": 0.6632996632996633, + "acc_stderr": 0.009697166595752475, + "acc_norm": 0.6447811447811448, + "acc_norm_stderr": 0.009820245899287124 + }, + "arc_challenge": { + "acc": 0.3199658703071672, + "acc_stderr": 0.013631345807016196, + "acc_norm": 0.3447098976109215, + "acc_norm_stderr": 0.01388881628678211 + }, + "sciq": { + "acc": 0.927, + "acc_stderr": 0.008230354715244062, + "acc_norm": 0.908, + "acc_norm_stderr": 0.009144376393151108 + }, + "piqa": { + "acc": 0.7665941240478781, + "acc_stderr": 0.009869247889521007, + "acc_norm": 0.7682263329706203, + "acc_norm_stderr": 0.00984514377279404 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_4_lm-eval_global_step84877_2023-05-15-10-06-46_4shots_backup.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_4_lm-eval_global_step84877_2023-05-15-10-06-46_4shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..5d45f389f5a7ddf986a1b66641193fa784359241 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_4_lm-eval_global_step84877_2023-05-15-10-06-46_4shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.344, + "acc_stderr": 0.015029633724408947 + }, + "anli_r2": { + "acc": 0.345, + "acc_stderr": 0.015039986742055238 + }, + "anli_r3": { + "acc": 0.34833333333333333, + "acc_stderr": 0.013759437498874072 + }, + "cb": { + "acc": 0.21428571428571427, + "acc_stderr": 0.055328333517248834, + "f1": 0.1997113997113997 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.5206134236207927, + "acc_stderr": 0.004985539159783419, + "acc_norm": 0.6900019916351324, + "acc_norm_stderr": 0.004615472210316043 + }, + "rte": { + "acc": 0.48736462093862815, + "acc_stderr": 0.030086851767188564 + }, + "winogrande": { + "acc": 0.6077348066298343, + "acc_stderr": 0.013722400462000888 + }, + "storycloze_2016": { + "acc": 0.7509353287012293, + "acc_stderr": 0.010000841162740146 + }, + "boolq": { + "acc": 0.636085626911315, + "acc_stderr": 0.00841491890912884 + }, + "arc_easy": { + "acc": 0.6632996632996633, + "acc_stderr": 0.009697166595752475, + "acc_norm": 0.6447811447811448, + "acc_norm_stderr": 0.009820245899287124 + }, + "arc_challenge": { + "acc": 0.3199658703071672, + "acc_stderr": 0.013631345807016196, + "acc_norm": 0.3447098976109215, + "acc_norm_stderr": 0.01388881628678211 + }, + "sciq": { + "acc": 0.927, + "acc_stderr": 0.008230354715244062, + "acc_norm": 0.908, + "acc_norm_stderr": 0.009144376393151108 + }, + "piqa": { + "acc": 0.7665941240478781, + "acc_stderr": 0.009869247889521007, + "acc_norm": 0.7682263329706203, + "acc_norm_stderr": 0.00984514377279404 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_5.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_5.json new file mode 100644 index 0000000000000000000000000000000000000000..70f7bf98fc89840e085695409830500dfbf177d6 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_5.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.34, + "acc_stderr": 0.014987482264363935 + }, + "anli_r2": { + "acc": 0.337, + "acc_stderr": 0.014955087918653605 + }, + "anli_r3": { + "acc": 0.3325, + "acc_stderr": 0.013605417345710526 + }, + "cb": { + "acc": 0.23214285714285715, + "acc_stderr": 0.0569293902400011, + "f1": 0.223351041141572 + }, + "copa": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816 + }, + "hellaswag": { + "acc": 0.5209121688906593, + "acc_stderr": 0.004985415250690905, + "acc_norm": 0.689205337582155, + "acc_norm_stderr": 0.004618730353217064 + }, + "rte": { + "acc": 0.5054151624548736, + "acc_stderr": 0.030094698123239966 + }, + "winogrande": { + "acc": 0.5935280189423836, + "acc_stderr": 0.01380444869775337 + }, + "storycloze_2016": { + "acc": 0.7498663816141101, + "acc_stderr": 0.010015143382536456 + }, + "boolq": { + "acc": 0.6409785932721712, + "acc_stderr": 0.008390241754319908 + }, + "arc_easy": { + "acc": 0.6536195286195287, + "acc_stderr": 0.009763542075695738, + "acc_norm": 0.6401515151515151, + "acc_norm_stderr": 0.009848484848484836 + }, + "arc_challenge": { + "acc": 0.3310580204778157, + "acc_stderr": 0.013752062419817834, + "acc_norm": 0.3447098976109215, + "acc_norm_stderr": 0.013888816286782112 + }, + "sciq": { + "acc": 0.923, + "acc_stderr": 0.008434580140240648, + "acc_norm": 0.912, + "acc_norm_stderr": 0.008963053962592074 + }, + "piqa": { + "acc": 0.7606093579978237, + "acc_stderr": 0.009955884250291688, + "acc_norm": 0.7747551686615887, + "acc_norm_stderr": 0.009746643471032136 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_5_lm-eval_global_step84877_2023-05-15-10-06-46_5shots_backup.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_5_lm-eval_global_step84877_2023-05-15-10-06-46_5shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..70f7bf98fc89840e085695409830500dfbf177d6 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_5_lm-eval_global_step84877_2023-05-15-10-06-46_5shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.34, + "acc_stderr": 0.014987482264363935 + }, + "anli_r2": { + "acc": 0.337, + "acc_stderr": 0.014955087918653605 + }, + "anli_r3": { + "acc": 0.3325, + "acc_stderr": 0.013605417345710526 + }, + "cb": { + "acc": 0.23214285714285715, + "acc_stderr": 0.0569293902400011, + "f1": 0.223351041141572 + }, + "copa": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816 + }, + "hellaswag": { + "acc": 0.5209121688906593, + "acc_stderr": 0.004985415250690905, + "acc_norm": 0.689205337582155, + "acc_norm_stderr": 0.004618730353217064 + }, + "rte": { + "acc": 0.5054151624548736, + "acc_stderr": 0.030094698123239966 + }, + "winogrande": { + "acc": 0.5935280189423836, + "acc_stderr": 0.01380444869775337 + }, + "storycloze_2016": { + "acc": 0.7498663816141101, + "acc_stderr": 0.010015143382536456 + }, + "boolq": { + "acc": 0.6409785932721712, + "acc_stderr": 0.008390241754319908 + }, + "arc_easy": { + "acc": 0.6536195286195287, + "acc_stderr": 0.009763542075695738, + "acc_norm": 0.6401515151515151, + "acc_norm_stderr": 0.009848484848484836 + }, + "arc_challenge": { + "acc": 0.3310580204778157, + "acc_stderr": 0.013752062419817834, + "acc_norm": 0.3447098976109215, + "acc_norm_stderr": 0.013888816286782112 + }, + "sciq": { + "acc": 0.923, + "acc_stderr": 0.008434580140240648, + "acc_norm": 0.912, + "acc_norm_stderr": 0.008963053962592074 + }, + "piqa": { + "acc": 0.7606093579978237, + "acc_stderr": 0.009955884250291688, + "acc_norm": 0.7747551686615887, + "acc_norm_stderr": 0.009746643471032136 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_0.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..17550129f3162d7f72b614b8cfe4b673bfd98d31 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5360651854033321, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04256493048481422}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.0899710062520438, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002733564353594117}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.33063321497259557, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005082809158106232}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11703731094627681, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023099010503697163}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.039336196641808305, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001715101531039642}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.14377777116972676, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003290079655812811}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.050209876854654424, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00139566067725402}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.079768875505422, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0024190846876120864}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.29935042842995513, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004616391395045465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10410690412481492, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001970211495017576}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0826555500749783, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002555867928952612}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.30355552506299893, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004656831028738861}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10721436888153839, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021127388747576538}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_1.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e94fc9e3dce22c16ede56c4528736f1a261d4081 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6802224009591102, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04492815625354331}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07461313068432787, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017662222515359694}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.372595398614972, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005022894768490944}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11390823787630369, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002092748719741799}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03144260517505359, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010954007812074113}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16216117762517218, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003540557825236836}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.04788274742708309, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001335468400942394}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06430823474158391, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001450822149093989}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.32776952027123735, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004463575595509288}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.09862949956148871, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017348311881018063}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06792680129399332, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016204262837167783}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.33968125496997575, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0045445995282505924}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10371590290401189, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001919446152689098}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_2.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e7ec67c96ec389e457e8b097577ec5e4854cfc04 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.7421753234818311, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.058142097599066474}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07467676852089566, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016365713066735295}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.39206002680015317, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005060271412913158}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11568167659841286, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019756616545099706}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03201968689196959, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010606073690619061}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.17433943475684258, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0036664073103424713}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.049111982430960446, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012783990550174547}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06439207073032543, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001373955096716673}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.34371763245797454, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004505609280479333}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10005506236504283, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001651667409449542}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06782658473689997, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015027459055493005}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.35767707252188435, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004646683404394275}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1050019387672539, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018008044515920333}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_3.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2e3735b0458e0513f41a6aa566cc6cbe4a187d63 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.7188535617714238, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03519944556258603}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07388743389054267, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014922551498186853}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.40655159407197466, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005087707266532105}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11579215217674567, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018264739390039843}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.0314899543029952, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009557711556575558}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1826251549753492, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003641907390766911}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.04901217699390464, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011685479931085388}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06252862451464879, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00121227139148576}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3504975578791793, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00448181425469438}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.0983386270641459, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001492600011551131}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06730929990997614, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013838254439389821}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3708026802911801, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004623882247190055}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10530117384191293, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016684330187746533}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_4.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..747a7897bfe7a3082df97790d5cae6604bcac4ab --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.7356439715076686, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.034000125678247964}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07276465535536372, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001402274340634281}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.4010732633333032, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005031473102978275}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1147943522144661, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018063385532961329}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.030441136569863136, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008680169230332024}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.17998860942195416, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0035899170058648713}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.04799828271736168, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011481297831730717}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.061409018888316014, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001162871962197699}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.34312128735122893, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004376699961011487}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.09700958657806398, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014957916703609423}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06590967070216937, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012895737406364428}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3647408763184601, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004596517842636764}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10389368450136928, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016495601390011836}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_5.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b1b2444bd1c95aa9741d41f11f5ddfaa62076c3a --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.774498674641057, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.046025666098424615}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07168554541147722, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013567491666828951}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.40311382031478993, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005093519815631301}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1145076719006982, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017986089153009496}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.029640249400459045, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008248970598907547}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.17922361100140816, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0036463149274911922}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.04748834860177524, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011381885469089572}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.05979676606804081, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010829180120786699}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3429993601688589, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0044165337458333025}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.09581522945526423, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014457046424409396}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0647765765675484, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012251354927212512}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.36627971544729243, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00466986591996086}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10343462084662852, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016190969843153712}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_0.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..deb96a1b930dcaed1c00ae37bf8e1a6e7e38ab17 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.16713881881245704, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019675559614062317}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2675635999010821, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002514916375462715}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.18849953981212825, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017651084291359991}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.03129812627625643, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008361148272854467}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.05071754722622078, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001311834312898096}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03506973226946951, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000828365798842279}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.12151132876190665, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001367091894817583}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.20104126997571714, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019684739493456994}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13822063710515434, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001209603068162531}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.15537471654065166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018202212529241414}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.25004836431590777, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023701023893618026}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.17552265116019092, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001631387985609637}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.5232593023758256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04747265065152494}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_1.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a8e4ce9ecd911ecee6bd6bac12cceb4a3f750d25 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.17053090746783298, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019872482776833456}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2862592054300288, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025208248871725165}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1984602458949581, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001835524387024883}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.034781757767719336, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007914156120289567}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.05819412473089733, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013767099922495528}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04000846126572821, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008452030738686245}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.11920675042988256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012760690917858948}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.2090473969119961, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019879307284216374}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.14054619741164162, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012057251064722665}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.16072751156146212, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018664748954273353}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2708352805729064, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023993133135588726}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18719993560041495, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017213110797211495}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.8880118627721794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05942943111971925}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_0.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..22045b31868212c59cba09e6b4ba8b21cc3d1a30 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 1.7691179321908588, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05199631794487594}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.12404388162125599, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014296085692360062}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.24113298891528734, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0021195221866358305}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.1597457244947749, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015754443648131056}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.028289803328052695, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008570729362836853}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.05442101911192778, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014043108131977875}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.035999793072600185, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009624756735963167}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.11310129172937744, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012492394490123515}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.22135445621870045, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001908698378645529}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.14600635284564262, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013795631395537417}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.10515677540232808, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012984677845430614}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.20544674202250837, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001966897239298014}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.1355872624921452, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014398399847291973}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_1.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..aa7e2caeaf7a040b2fd359b187913c0635bd3426 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 5.651044586681334, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07926438990816918}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.2716212972945311, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026058040333959537}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4236216032222433, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026180226506166463}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3078278785254422, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001908062705345462}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.10952561587924448, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016418857574103474}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.17185473066549486, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018971444386739766}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.12259566762702832, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013108581526452968}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.20793662314020775, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001966736398197739}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3306629249371328, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021642636770467646}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.23718705321133973, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00138256279237305}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.2271295898568511, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023078665148769063}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.35503755430035955, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002474276440762186}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2573509246055057, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017483657121627276}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_2.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f6521f103eb8ce5d270861bad0e2584739037bdc --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 5.8424163448955415, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08355827344599492}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.2566016107317033, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022067745089076256}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.461194891770376, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025273079440682293}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.31288302842111376, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017808501976129172}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.10663824144759744, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001372692263793003}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.1963344119197119, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019212562311138061}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.12981003854776116, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001232713595041452}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.19466241734213402, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015883723269025356}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.35748703923277775, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002135743881465239}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.23920101858831294, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012833315214473093}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.2160451049013204, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019568292320360082}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.38977835292938257, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002475205894007756}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.26382166088143494, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016894009510511382}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_3.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b9a515524675a83f9273b363cc0223444b30b32a --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 5.810474759979567, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07579006844458654}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.24309828934304148, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019005898753670413}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4670566019113305, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0023932775436402193}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.307699846077658, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016836798456212607}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.10168781845908033, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011883824391395869}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.20209647607764591, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019104221912810875}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.12918070346005242, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011654611122014701}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.18580091003866794, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014382176327432795}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.36221484092783923, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020685753646924234}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.23605748656654202, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012419184728918431}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.20525158463645873, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017314120087883877}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3947742486081849, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002360306367469496}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.25979950139324387, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016108185985968808}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_4.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7c4df208fb995968d71c4cc657828e2ceefa3125 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 5.856583823504068, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08590752222075627}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.24083912701088211, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001891795976143307}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.46987502312106944, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002373970795855411}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3072297336966097, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001677600068476274}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.10194079575537923, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012738284927660533}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.20486607531207773, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019180923641256352}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.13013649575708963, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011872264816164021}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.18407870093260886, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00147269880081677}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3634519313727439, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020191238189298604}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.23535954148408947, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001235256889547661}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.20324551641227329, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001730862935580683}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3970250584377568, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00232405668117713}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2592529609283163, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015947975783409823}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_5.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..92f8fbee08451d8fc1303c7d134d93815a7cfe49 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 5.701988223904371, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05974327947875344}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.2357192890434931, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017709653279963382}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.46793875744209246, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0023377619498614883}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.30342180002404195, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016429215832907001}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.09834833044152837, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011541319656096648}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2017968785574919, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001929650748227801}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.12711050572770574, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001182240229485702}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.17902630726475285, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012897932939114531}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.36086666712927185, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019704166593789243}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2315222472386274, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011861982598106887}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.19964816084000173, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016311639902271977}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.39633205907689883, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0022975718064400996}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.256915761431013, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015851952901884037}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_0.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c22f7126136b4a8451d94a2f7f1d85433f7bcdc1 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1655773024489004, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0028576845139351987}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.32851562505932314, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0039803284921081}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.2085969207130687, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002677475599144402}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.036089021388490294, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015969906559037688}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.07051409622481412, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025672161932226503}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.044656791266145565, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016785808050176795}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.12315934281916667, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0022342850887312926}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2450990337124764, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031721420821518784}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15500054113220843, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002069637277533473}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.130324877139233, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002282842578765438}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2618161404002937, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034400702646342324}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1648368106728737, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021862489460295126}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.7324193654708817, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13416126334094292}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_1.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d560771ecbceadf22ac3ee5d7e22fda8a488cf13 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.11802826309532148, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001749157965306058}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.2892400892348949, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003843967130143464}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.16538496371657396, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022823923941345326}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.017821354334989277, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008141612570301904}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.04545518558443322, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00206477086181511}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.025192202332776397, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011296247751961938}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.08829393492717166, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011978449597419942}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.219464699636153, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0029458921049705118}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.12421960300726272, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016143054069980196}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.09409421229331506, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013239443870656413}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.23385506502219394, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00324955152033377}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.13238865606315464, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017891980246226586}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.9718103240280566, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0618162240060556}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_2.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3591ce37f1d26f7bfb60a1bdee9a6fd390069f42 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.11300176611968675, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015556940763036152}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.28221717850396943, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003575100110282272}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.15944247539707976, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002076895889562475}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.016338317353700334, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007066281144508859}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.04249427144713084, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018743700982079647}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.023271326997139498, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000998854505955608}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.08541393540574391, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011196684087793841}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.21475231259583918, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027096441989616395}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.12070835773462767, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015063108121480115}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0910255058370105, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012384505621305965}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.22901792345741215, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029934117569162446}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.12867240195403112, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0016705058332119036}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.8276490187392188, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06507121832672531}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_0.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..88e7f34fdfc729354a2b4e815977a92dd69c3198 --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf88e33a55c34025a07d79dab53c6766ef807950d54b473cf7f703c65dd53959 +size 4002555 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_1.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a89e0ca4641f499dc92e40e884606c0fcbddc4d6 --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0705c6d4a948db8aec235d02c1ff2486623824312c8597fda9adb8353386406 +size 5153682 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_2.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a3a6d94d1bce6f0f46064cfd54910ee9f2061464 --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31bf8f713d4f1a9dc876e43d47675e9d088e60a72c226e67cc0252d4277c096f +size 6063528 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_3.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..94a8db888fa2288918fb2b006b36848510235ddd --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:997d5edf5956cd3ddeccac543cd11ae720e3b04c21b0bcb075e614533cb42dfd +size 6974107 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_4.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d5ab5677839d2adff2959ba141ce4ec4e686fe12 --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2ea27bbdd65361eeffdf1415da7f820b120208103b474fc7e2039792359c57f +size 7855474 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_5.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f00233236bae38dfe069ddde7cd51c7418b552a --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53254de4132d7ff9a3f710073ff11235f80c15fec827cbfb21e492e54b0a51a4 +size 8774285 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_0.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c9c186885f0b9aecaff2e473fb89b60d99d77f13 --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12c6bb90f2100ca84e9c2f2d9623a119d7d998d723deba5ef4bb73359df096ae +size 7692206 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_1.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0711b40f9c035556c94ecc23c7fb5cb72844e243 --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60b3a8fb612a485701a3a01e9105c0fc2f03bc2cc494a69a1a60cc602e241dee +size 13321079 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_2.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_3.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_4.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_5.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6ccd861e0ffa57a76d6dced1a57b45ad9df2b1d3 --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46ceb73aedec05124d413a0fcae8bdb8bf4fab207ade1e62df12a032988c11fc +size 4489612 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9e5a0585313656078554d59a181be9e605900f92 --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42375f7039b6762a6af14e135a9b76e6231d4224a69839655d662bd727db2874 +size 5405120 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d95cdae78e96afc1bd829c3de371c39fd82047d --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5735e8345d99ffe079a3ba23a5e2dd8c9c7b538e6bdc2baa082e1b264b5f7e7b +size 6547631 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0f917c3582e2da9c6b28778d3fdc75dd61a5f490 --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32e484d025c7c1eb3edabe941bd11e6c71fea45873e1aa031d643867cb30aaa5 +size 7655673 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7d42ee62a1069cf8224fb95aeddfd40ced82880a --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d0a314a224f5d10caa84102231a518a643725fa42ce44d84d6fbae7db49dbc5 +size 8739962 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bca5a6eb0e12c36345b7518047f1a7a3a82c6abf --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e3ff0e05dc30b2c9f35b210cd5c10a6b6b8e86f69e1b777dae2f9742318caba +size 9833882 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_0.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5a7eec43cc0050f1817d3f132d861f7a1aaa848b --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f48d04f8df448365eeb3fb32c54a2214fc39a94af254f24af1f81ecdf75aa6c +size 2799092 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_1.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5652783ec3b8f44ab207f6bba6ff10e8b37435f2 --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edc43687a012ad536184411af971a152e682f796c1fbf3e763e36252e6913578 +size 5104164 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_2.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..64ee67033bfcc97e68d7a730b6f70cfb4efa5190 --- /dev/null +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b67b949168f7795c80af96ef2ddb9e4ae8b5b0622e84451e181f15c5690028d +size 7382602 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_3.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_4.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_5.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_0.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3442254d433669c22337ed7c7d25d147fed6f20f --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5360651854033321, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04256493048481422 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0899710062520438, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002733564353594117 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.33063321497259557, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005082809158106232 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11703731094627681, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023099010503697163 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.039336196641808305, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001715101531039642 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.14377777116972676, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003290079655812811 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.050209876854654424, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00139566067725402 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.079768875505422, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0024190846876120864 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.29935042842995513, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004616391395045465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10410690412481492, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001970211495017576 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0826555500749783, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002555867928952612 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.30355552506299893, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004656831028738861 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10721436888153839, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021127388747576538 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_1.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..50be49201c15804e6356193f5484840586d77438 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6802224009591102, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04492815625354331 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07461313068432787, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017662222515359694 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.372595398614972, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005022894768490944 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11390823787630369, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002092748719741799 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03144260517505359, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010954007812074113 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16216117762517218, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003540557825236836 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.04788274742708309, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001335468400942394 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06430823474158391, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001450822149093989 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.32776952027123735, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004463575595509288 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.09862949956148871, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017348311881018063 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06792680129399332, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016204262837167783 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.33968125496997575, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0045445995282505924 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10371590290401189, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001919446152689098 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_2.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0d6022f0384d03cf636c190f05f52982fbc439b7 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7421753234818311, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.058142097599066474 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07467676852089566, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016365713066735295 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.39206002680015317, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005060271412913158 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11568167659841286, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019756616545099706 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03201968689196959, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010606073690619061 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.17433943475684258, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0036664073103424713 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.049111982430960446, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012783990550174547 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06439207073032543, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001373955096716673 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.34371763245797454, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004505609280479333 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10005506236504283, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001651667409449542 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06782658473689997, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015027459055493005 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.35767707252188435, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004646683404394275 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1050019387672539, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018008044515920333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_3.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7c398fb433604b25f18289d7ad4718dca59d4ac6 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7188535617714238, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03519944556258603 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07388743389054267, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014922551498186853 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.40655159407197466, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005087707266532105 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11579215217674567, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018264739390039843 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.0314899543029952, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009557711556575558 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1826251549753492, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003641907390766911 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.04901217699390464, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011685479931085388 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06252862451464879, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00121227139148576 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3504975578791793, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00448181425469438 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.0983386270641459, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001492600011551131 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06730929990997614, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013838254439389821 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3708026802911801, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004623882247190055 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10530117384191293, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016684330187746533 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_4.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8cf7f804d99fb8b1f7deb7887ce4987ea0c9893e --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7356439715076686, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.034000125678247964 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07276465535536372, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001402274340634281 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4010732633333032, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005031473102978275 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.1147943522144661, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018063385532961329 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.030441136569863136, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008680169230332024 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.17998860942195416, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0035899170058648713 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.04799828271736168, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011481297831730717 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.061409018888316014, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001162871962197699 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.34312128735122893, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004376699961011487 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.09700958657806398, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014957916703609423 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06590967070216937, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012895737406364428 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3647408763184601, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004596517842636764 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10389368450136928, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016495601390011836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_5.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c092945027e20b38eb386c19fb78636e358eb8f4 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.774498674641057, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.046025666098424615 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07168554541147722, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013567491666828951 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.40311382031478993, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005093519815631301 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.1145076719006982, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017986089153009496 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.029640249400459045, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008248970598907547 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.17922361100140816, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0036463149274911922 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.04748834860177524, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011381885469089572 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.05979676606804081, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010829180120786699 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3429993601688589, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044165337458333025 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.09581522945526423, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014457046424409396 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0647765765675484, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012251354927212512 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.36627971544729243, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00466986591996086 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10343462084662852, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016190969843153712 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_0.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cbe3063a85f30fe75a705b9a2e568e19d91e7915 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.16713881881245704, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019675559614062317 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2675635999010821, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002514916375462715 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.18849953981212825, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017651084291359991 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.03129812627625643, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008361148272854467 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05071754722622078, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001311834312898096 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03506973226946951, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000828365798842279 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.12151132876190665, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001367091894817583 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.20104126997571714, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019684739493456994 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13822063710515434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001209603068162531 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.15537471654065166, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018202212529241414 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.25004836431590777, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0023701023893618026 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.17552265116019092, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001631387985609637 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.5232593023758256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04747265065152494 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_1.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..388cbf0abf4e90c1841a99279f635fa227495873 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.17053090746783298, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019872482776833456 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2862592054300288, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0025208248871725165 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1984602458949581, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001835524387024883 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.034781757767719336, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007914156120289567 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05819412473089733, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013767099922495528 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04000846126572821, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008452030738686245 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.11920675042988256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012760690917858948 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.2090473969119961, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019879307284216374 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14054619741164162, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012057251064722665 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.16072751156146212, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018664748954273353 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2708352805729064, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0023993133135588726 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.18719993560041495, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017213110797211495 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.8880118627721794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05942943111971925 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_0.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c34494f2ff0c545dc907c65e0cf5acfe137322c7 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 1.7691179321908588, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.05199631794487594 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.12404388162125599, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0014296085692360062 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.24113298891528734, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0021195221866358305 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.1597457244947749, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0015754443648131056 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.028289803328052695, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0008570729362836853 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.05442101911192778, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0014043108131977875 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.035999793072600185, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0009624756735963167 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.11310129172937744, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012492394490123515 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.22135445621870045, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.001908698378645529 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.14600635284564262, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013795631395537417 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.10515677540232808, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0012984677845430614 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.20544674202250837, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.001966897239298014 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.1355872624921452, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014398399847291973 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_1.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6bd726fa6bd4c0c992a4b26c8d4439dc28562c8a --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 5.651044586681334, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07926438990816918 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.2716212972945311, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0026058040333959537 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4236216032222433, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026180226506166463 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3078278785254422, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001908062705345462 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.10952561587924448, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016418857574103474 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.17185473066549486, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018971444386739766 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.12259566762702832, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013108581526452968 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.20793662314020775, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001966736398197739 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3306629249371328, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0021642636770467646 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.23718705321133973, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00138256279237305 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.2271295898568511, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0023078665148769063 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.35503755430035955, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002474276440762186 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2573509246055057, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017483657121627276 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_2.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5a1d8583363ddc1ebde6c87e123d9f97cd36bc62 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 5.8424163448955415, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08355827344599492 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.2566016107317033, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0022067745089076256 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.461194891770376, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025273079440682293 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.31288302842111376, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017808501976129172 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.10663824144759744, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001372692263793003 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.1963344119197119, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019212562311138061 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.12981003854776116, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001232713595041452 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.19466241734213402, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0015883723269025356 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35748703923277775, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002135743881465239 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.23920101858831294, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0012833315214473093 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.2160451049013204, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0019568292320360082 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.38977835292938257, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002475205894007756 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.26382166088143494, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016894009510511382 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_3.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..93ccbfa926e7c960ab13ce505d5e8eacd94663a0 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 5.810474759979567, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07579006844458654 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.24309828934304148, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0019005898753670413 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4670566019113305, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0023932775436402193 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.307699846077658, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016836798456212607 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.10168781845908033, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011883824391395869 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.20209647607764591, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019104221912810875 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.12918070346005242, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0011654611122014701 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.18580091003866794, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014382176327432795 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.36221484092783923, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0020685753646924234 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.23605748656654202, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0012419184728918431 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.20525158463645873, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0017314120087883877 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3947742486081849, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002360306367469496 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.25979950139324387, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016108185985968808 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_4.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8d7b758739ede3ee8ad03e8b464ac21f613ede5c --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 5.856583823504068, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08590752222075627 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.24083912701088211, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001891795976143307 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.46987502312106944, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002373970795855411 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3072297336966097, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001677600068476274 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.10194079575537923, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0012738284927660533 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.20486607531207773, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019180923641256352 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.13013649575708963, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0011872264816164021 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.18407870093260886, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00147269880081677 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3634519313727439, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0020191238189298604 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.23535954148408947, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001235256889547661 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.20324551641227329, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001730862935580683 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3970250584377568, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00232405668117713 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2592529609283163, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015947975783409823 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_5.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a4733db008512b4768ebd7aceb1ae062926d279d --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 5.701988223904371, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.05974327947875344 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.2357192890434931, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0017709653279963382 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.46793875744209246, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0023377619498614883 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.30342180002404195, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016429215832907001 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.09834833044152837, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011541319656096648 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2017968785574919, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001929650748227801 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.12711050572770574, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001182240229485702 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.17902630726475285, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012897932939114531 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.36086666712927185, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0019704166593789243 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2315222472386274, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0011861982598106887 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.19964816084000173, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0016311639902271977 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.39633205907689883, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0022975718064400996 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.256915761431013, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015851952901884037 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_0.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3cbe369c57367239a3ad30912279c59073a39839 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1655773024489004, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0028576845139351987 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.32851562505932314, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0039803284921081 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2085969207130687, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002677475599144402 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.036089021388490294, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0015969906559037688 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.07051409622481412, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0025672161932226503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.044656791266145565, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016785808050176795 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.12315934281916667, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0022342850887312926 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2450990337124764, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031721420821518784 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15500054113220843, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002069637277533473 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.130324877139233, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002282842578765438 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2618161404002937, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0034400702646342324 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1648368106728737, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021862489460295126 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.7324193654708817, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13416126334094292 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_1.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..efb309864356029687079cb4212980fad57b4ed9 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.11802826309532148, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001749157965306058 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2892400892348949, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003843967130143464 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.16538496371657396, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0022823923941345326 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.017821354334989277, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008141612570301904 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.04545518558443322, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00206477086181511 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.025192202332776397, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011296247751961938 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.08829393492717166, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011978449597419942 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.219464699636153, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0029458921049705118 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.12421960300726272, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0016143054069980196 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.09409421229331506, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0013239443870656413 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.23385506502219394, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00324955152033377 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.13238865606315464, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0017891980246226586 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.9718103240280566, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0618162240060556 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_2.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..91ea4a6252a118eeadec1193dce6cf38e6618e45 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.11300176611968675, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0015556940763036152 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.28221717850396943, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003575100110282272 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.15944247539707976, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002076895889562475 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.016338317353700334, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0007066281144508859 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.04249427144713084, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0018743700982079647 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.023271326997139498, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.000998854505955608 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.08541393540574391, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011196684087793841 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.21475231259583918, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0027096441989616395 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.12070835773462767, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0015063108121480115 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0910255058370105, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012384505621305965 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.22901792345741215, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0029934117569162446 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.12867240195403112, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0016705058332119036 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.8276490187392188, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06507121832672531 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_0.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cea3eebe18894dbcf770ee9a54525a4955d070ec --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.333, + "acc_stderr": 0.014910846164229863 + }, + "anli_r2": { + "acc": 0.35, + "acc_stderr": 0.015090650341444231 + }, + "anli_r3": { + "acc": 0.3491666666666667, + "acc_stderr": 0.013767075395077249 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.0673769750864465, + "f1": 0.3432539682539683 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.45757817167894843, + "acc_stderr": 0.004971789638563322, + "acc_norm": 0.5881298546106354, + "acc_norm_stderr": 0.004911659884506154 + }, + "rte": { + "acc": 0.48736462093862815, + "acc_stderr": 0.030086851767188564 + }, + "winogrande": { + "acc": 0.5580110497237569, + "acc_stderr": 0.01395758407910899 + }, + "storycloze_2016": { + "acc": 0.6900053447354356, + "acc_stderr": 0.010695042806212553 + }, + "boolq": { + "acc": 0.5504587155963303, + "acc_stderr": 0.008700409761350798 + }, + "arc_easy": { + "acc": 0.5361952861952862, + "acc_stderr": 0.010232865550346727, + "acc_norm": 0.4823232323232323, + "acc_norm_stderr": 0.010253369805698971 + }, + "arc_challenge": { + "acc": 0.2636518771331058, + "acc_stderr": 0.012875929151297056, + "acc_norm": 0.30631399317406144, + "acc_norm_stderr": 0.013470584417276513 + }, + "sciq": { + "acc": 0.765, + "acc_stderr": 0.013414729030247116, + "acc_norm": 0.68, + "acc_norm_stderr": 0.014758652303574885 + }, + "piqa": { + "acc": 0.7334058759521219, + "acc_stderr": 0.010316749863541369, + "acc_norm": 0.733949945593036, + "acc_norm_stderr": 0.010310039263352827 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_0_lm-eval_global_step84877_2023-05-15-10-06-37_0shots_backup.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_0_lm-eval_global_step84877_2023-05-15-10-06-37_0shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..cea3eebe18894dbcf770ee9a54525a4955d070ec --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_0_lm-eval_global_step84877_2023-05-15-10-06-37_0shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.333, + "acc_stderr": 0.014910846164229863 + }, + "anli_r2": { + "acc": 0.35, + "acc_stderr": 0.015090650341444231 + }, + "anli_r3": { + "acc": 0.3491666666666667, + "acc_stderr": 0.013767075395077249 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.0673769750864465, + "f1": 0.3432539682539683 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.45757817167894843, + "acc_stderr": 0.004971789638563322, + "acc_norm": 0.5881298546106354, + "acc_norm_stderr": 0.004911659884506154 + }, + "rte": { + "acc": 0.48736462093862815, + "acc_stderr": 0.030086851767188564 + }, + "winogrande": { + "acc": 0.5580110497237569, + "acc_stderr": 0.01395758407910899 + }, + "storycloze_2016": { + "acc": 0.6900053447354356, + "acc_stderr": 0.010695042806212553 + }, + "boolq": { + "acc": 0.5504587155963303, + "acc_stderr": 0.008700409761350798 + }, + "arc_easy": { + "acc": 0.5361952861952862, + "acc_stderr": 0.010232865550346727, + "acc_norm": 0.4823232323232323, + "acc_norm_stderr": 0.010253369805698971 + }, + "arc_challenge": { + "acc": 0.2636518771331058, + "acc_stderr": 0.012875929151297056, + "acc_norm": 0.30631399317406144, + "acc_norm_stderr": 0.013470584417276513 + }, + "sciq": { + "acc": 0.765, + "acc_stderr": 0.013414729030247116, + "acc_norm": 0.68, + "acc_norm_stderr": 0.014758652303574885 + }, + "piqa": { + "acc": 0.7334058759521219, + "acc_stderr": 0.010316749863541369, + "acc_norm": 0.733949945593036, + "acc_norm_stderr": 0.010310039263352827 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_1.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a97390b22c5b74ef7af35988448b467b350cf7d1 --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.357, + "acc_stderr": 0.015158521721486774 + }, + "anli_r2": { + "acc": 0.334, + "acc_stderr": 0.014922019523732967 + }, + "anli_r3": { + "acc": 0.37416666666666665, + "acc_stderr": 0.01397501560175897 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.26788664379209554 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.45339573790081655, + "acc_stderr": 0.004968058944472161, + "acc_norm": 0.5857398924517029, + "acc_norm_stderr": 0.004915870966174404 + }, + "rte": { + "acc": 0.4657039711191336, + "acc_stderr": 0.030025579819366422 + }, + "winogrande": { + "acc": 0.5509076558800315, + "acc_stderr": 0.01397945938914086 + }, + "storycloze_2016": { + "acc": 0.6878674505611972, + "acc_stderr": 0.010715220346279681 + }, + "boolq": { + "acc": 0.517125382262997, + "acc_stderr": 0.008739923994130054 + }, + "arc_easy": { + "acc": 0.5307239057239057, + "acc_stderr": 0.010240395584815237, + "acc_norm": 0.5084175084175084, + "acc_norm_stderr": 0.010258329515226462 + }, + "arc_challenge": { + "acc": 0.2568259385665529, + "acc_stderr": 0.0127669237941168, + "acc_norm": 0.31399317406143346, + "acc_norm_stderr": 0.013562691224726297 + }, + "sciq": { + "acc": 0.799, + "acc_stderr": 0.012679107214617328, + "acc_norm": 0.751, + "acc_norm_stderr": 0.0136816002787023 + }, + "piqa": { + "acc": 0.7399347116430903, + "acc_stderr": 0.010234893249061303, + "acc_norm": 0.7328618063112078, + "acc_norm_stderr": 0.01032344049261243 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_1_lm-eval_global_step84877_2023-05-15-10-06-37_1shots_backup.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_1_lm-eval_global_step84877_2023-05-15-10-06-37_1shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..a97390b22c5b74ef7af35988448b467b350cf7d1 --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_1_lm-eval_global_step84877_2023-05-15-10-06-37_1shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.357, + "acc_stderr": 0.015158521721486774 + }, + "anli_r2": { + "acc": 0.334, + "acc_stderr": 0.014922019523732967 + }, + "anli_r3": { + "acc": 0.37416666666666665, + "acc_stderr": 0.01397501560175897 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.26788664379209554 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.45339573790081655, + "acc_stderr": 0.004968058944472161, + "acc_norm": 0.5857398924517029, + "acc_norm_stderr": 0.004915870966174404 + }, + "rte": { + "acc": 0.4657039711191336, + "acc_stderr": 0.030025579819366422 + }, + "winogrande": { + "acc": 0.5509076558800315, + "acc_stderr": 0.01397945938914086 + }, + "storycloze_2016": { + "acc": 0.6878674505611972, + "acc_stderr": 0.010715220346279681 + }, + "boolq": { + "acc": 0.517125382262997, + "acc_stderr": 0.008739923994130054 + }, + "arc_easy": { + "acc": 0.5307239057239057, + "acc_stderr": 0.010240395584815237, + "acc_norm": 0.5084175084175084, + "acc_norm_stderr": 0.010258329515226462 + }, + "arc_challenge": { + "acc": 0.2568259385665529, + "acc_stderr": 0.0127669237941168, + "acc_norm": 0.31399317406143346, + "acc_norm_stderr": 0.013562691224726297 + }, + "sciq": { + "acc": 0.799, + "acc_stderr": 0.012679107214617328, + "acc_norm": 0.751, + "acc_norm_stderr": 0.0136816002787023 + }, + "piqa": { + "acc": 0.7399347116430903, + "acc_stderr": 0.010234893249061303, + "acc_norm": 0.7328618063112078, + "acc_norm_stderr": 0.01032344049261243 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_2.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b11af2a2b21df78c7fbe0624c9997898bb3e381a --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.349, + "acc_stderr": 0.0150806639915631 + }, + "anli_r2": { + "acc": 0.335, + "acc_stderr": 0.014933117490932566 + }, + "anli_r3": { + "acc": 0.34833333333333333, + "acc_stderr": 0.013759437498874075 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.25396825396825395 + }, + "copa": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794 + }, + "hellaswag": { + "acc": 0.45797649870543716, + "acc_stderr": 0.004972126523031945, + "acc_norm": 0.5928101971718781, + "acc_norm_stderr": 0.004903066639761946 + }, + "rte": { + "acc": 0.4981949458483754, + "acc_stderr": 0.030096267148976626 + }, + "winogrande": { + "acc": 0.5651144435674822, + "acc_stderr": 0.013932814110418013 + }, + "storycloze_2016": { + "acc": 0.6857295563869589, + "acc_stderr": 0.010735132285108176 + }, + "boolq": { + "acc": 0.5137614678899083, + "acc_stderr": 0.008741742106878655 + }, + "arc_easy": { + "acc": 0.5294612794612794, + "acc_stderr": 0.010241957728409683, + "acc_norm": 0.5105218855218855, + "acc_norm_stderr": 0.010257511546488227 + }, + "arc_challenge": { + "acc": 0.26791808873720135, + "acc_stderr": 0.012942030195136421, + "acc_norm": 0.3054607508532423, + "acc_norm_stderr": 0.013460080478002501 + }, + "sciq": { + "acc": 0.799, + "acc_stderr": 0.012679107214617331, + "acc_norm": 0.778, + "acc_norm_stderr": 0.013148721948877366 + }, + "piqa": { + "acc": 0.7328618063112078, + "acc_stderr": 0.010323440492612437, + "acc_norm": 0.7377584330794341, + "acc_norm_stderr": 0.010262502565172447 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_2_lm-eval_global_step84877_2023-05-15-10-06-37_2shots_backup.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_2_lm-eval_global_step84877_2023-05-15-10-06-37_2shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..b11af2a2b21df78c7fbe0624c9997898bb3e381a --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_2_lm-eval_global_step84877_2023-05-15-10-06-37_2shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.349, + "acc_stderr": 0.0150806639915631 + }, + "anli_r2": { + "acc": 0.335, + "acc_stderr": 0.014933117490932566 + }, + "anli_r3": { + "acc": 0.34833333333333333, + "acc_stderr": 0.013759437498874075 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.25396825396825395 + }, + "copa": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794 + }, + "hellaswag": { + "acc": 0.45797649870543716, + "acc_stderr": 0.004972126523031945, + "acc_norm": 0.5928101971718781, + "acc_norm_stderr": 0.004903066639761946 + }, + "rte": { + "acc": 0.4981949458483754, + "acc_stderr": 0.030096267148976626 + }, + "winogrande": { + "acc": 0.5651144435674822, + "acc_stderr": 0.013932814110418013 + }, + "storycloze_2016": { + "acc": 0.6857295563869589, + "acc_stderr": 0.010735132285108176 + }, + "boolq": { + "acc": 0.5137614678899083, + "acc_stderr": 0.008741742106878655 + }, + "arc_easy": { + "acc": 0.5294612794612794, + "acc_stderr": 0.010241957728409683, + "acc_norm": 0.5105218855218855, + "acc_norm_stderr": 0.010257511546488227 + }, + "arc_challenge": { + "acc": 0.26791808873720135, + "acc_stderr": 0.012942030195136421, + "acc_norm": 0.3054607508532423, + "acc_norm_stderr": 0.013460080478002501 + }, + "sciq": { + "acc": 0.799, + "acc_stderr": 0.012679107214617331, + "acc_norm": 0.778, + "acc_norm_stderr": 0.013148721948877366 + }, + "piqa": { + "acc": 0.7328618063112078, + "acc_stderr": 0.010323440492612437, + "acc_norm": 0.7377584330794341, + "acc_norm_stderr": 0.010262502565172447 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_3.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8fccb7c1bfef42b9d3f433ddca9b0b1a4a931390 --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.356, + "acc_stderr": 0.01514904265930662 + }, + "anli_r2": { + "acc": 0.366, + "acc_stderr": 0.01524061272640576 + }, + "anli_r3": { + "acc": 0.35583333333333333, + "acc_stderr": 0.013826518748493315 + }, + "cb": { + "acc": 0.5, + "acc_stderr": 0.06741998624632421, + "f1": 0.34491725768321513 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.4587731527584147, + "acc_stderr": 0.004972790690640181, + "acc_norm": 0.5902210714997013, + "acc_norm_stderr": 0.004907877144720023 + }, + "rte": { + "acc": 0.4729241877256318, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.5406471981057617, + "acc_stderr": 0.014005973823825136 + }, + "storycloze_2016": { + "acc": 0.689470871191876, + "acc_stderr": 0.010700112173178448 + }, + "boolq": { + "acc": 0.5131498470948013, + "acc_stderr": 0.008742030090044975 + }, + "arc_easy": { + "acc": 0.5231481481481481, + "acc_stderr": 0.010248782484554471, + "acc_norm": 0.5046296296296297, + "acc_norm_stderr": 0.010259343705889728 + }, + "arc_challenge": { + "acc": 0.2713310580204778, + "acc_stderr": 0.012993807727545792, + "acc_norm": 0.310580204778157, + "acc_norm_stderr": 0.013522292098053054 + }, + "sciq": { + "acc": 0.795, + "acc_stderr": 0.012772554096113109, + "acc_norm": 0.783, + "acc_norm_stderr": 0.01304151375727071 + }, + "piqa": { + "acc": 0.7236126224156693, + "acc_stderr": 0.010434162388275624, + "acc_norm": 0.7312295973884657, + "acc_norm_stderr": 0.010343392940090011 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_3_lm-eval_global_step84877_2023-05-15-10-06-37_3shots_backup.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_3_lm-eval_global_step84877_2023-05-15-10-06-37_3shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..8fccb7c1bfef42b9d3f433ddca9b0b1a4a931390 --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_3_lm-eval_global_step84877_2023-05-15-10-06-37_3shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.356, + "acc_stderr": 0.01514904265930662 + }, + "anli_r2": { + "acc": 0.366, + "acc_stderr": 0.01524061272640576 + }, + "anli_r3": { + "acc": 0.35583333333333333, + "acc_stderr": 0.013826518748493315 + }, + "cb": { + "acc": 0.5, + "acc_stderr": 0.06741998624632421, + "f1": 0.34491725768321513 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.4587731527584147, + "acc_stderr": 0.004972790690640181, + "acc_norm": 0.5902210714997013, + "acc_norm_stderr": 0.004907877144720023 + }, + "rte": { + "acc": 0.4729241877256318, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.5406471981057617, + "acc_stderr": 0.014005973823825136 + }, + "storycloze_2016": { + "acc": 0.689470871191876, + "acc_stderr": 0.010700112173178448 + }, + "boolq": { + "acc": 0.5131498470948013, + "acc_stderr": 0.008742030090044975 + }, + "arc_easy": { + "acc": 0.5231481481481481, + "acc_stderr": 0.010248782484554471, + "acc_norm": 0.5046296296296297, + "acc_norm_stderr": 0.010259343705889728 + }, + "arc_challenge": { + "acc": 0.2713310580204778, + "acc_stderr": 0.012993807727545792, + "acc_norm": 0.310580204778157, + "acc_norm_stderr": 0.013522292098053054 + }, + "sciq": { + "acc": 0.795, + "acc_stderr": 0.012772554096113109, + "acc_norm": 0.783, + "acc_norm_stderr": 0.01304151375727071 + }, + "piqa": { + "acc": 0.7236126224156693, + "acc_stderr": 0.010434162388275624, + "acc_norm": 0.7312295973884657, + "acc_norm_stderr": 0.010343392940090011 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_4.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_4.json new file mode 100644 index 0000000000000000000000000000000000000000..de8d79ccfcb5c2bd6a45d878223d0caca2846740 --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.335, + "acc_stderr": 0.014933117490932572 + }, + "anli_r2": { + "acc": 0.344, + "acc_stderr": 0.015029633724408948 + }, + "anli_r3": { + "acc": 0.36916666666666664, + "acc_stderr": 0.01393666834928527 + }, + "cb": { + "acc": 0.4642857142857143, + "acc_stderr": 0.06724777654937658, + "f1": 0.3255198942180551 + }, + "copa": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316 + }, + "hellaswag": { + "acc": 0.45518820952001593, + "acc_stderr": 0.004969701081068371, + "acc_norm": 0.5893248356901015, + "acc_norm_stderr": 0.004909509538525173 + }, + "rte": { + "acc": 0.4729241877256318, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.5438042620363063, + "acc_stderr": 0.01399845361092433 + }, + "storycloze_2016": { + "acc": 0.6964190272581507, + "acc_stderr": 0.010632901358518371 + }, + "boolq": { + "acc": 0.5165137614678899, + "acc_stderr": 0.008740284046486645 + }, + "arc_easy": { + "acc": 0.5378787878787878, + "acc_stderr": 0.010230299628864799, + "acc_norm": 0.5206228956228957, + "acc_norm_stderr": 0.010251052755716122 + }, + "arc_challenge": { + "acc": 0.26535836177474403, + "acc_stderr": 0.012902554762313967, + "acc_norm": 0.3037542662116041, + "acc_norm_stderr": 0.01343890918477876 + }, + "sciq": { + "acc": 0.806, + "acc_stderr": 0.012510816141264362, + "acc_norm": 0.777, + "acc_norm_stderr": 0.013169830843425672 + }, + "piqa": { + "acc": 0.719804134929271, + "acc_stderr": 0.01047812201557708, + "acc_norm": 0.7274211099020674, + "acc_norm_stderr": 0.010389256803296018 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_4_lm-eval_global_step84877_2023-05-15-10-07-33_4shots_backup.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_4_lm-eval_global_step84877_2023-05-15-10-07-33_4shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..de8d79ccfcb5c2bd6a45d878223d0caca2846740 --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_4_lm-eval_global_step84877_2023-05-15-10-07-33_4shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.335, + "acc_stderr": 0.014933117490932572 + }, + "anli_r2": { + "acc": 0.344, + "acc_stderr": 0.015029633724408948 + }, + "anli_r3": { + "acc": 0.36916666666666664, + "acc_stderr": 0.01393666834928527 + }, + "cb": { + "acc": 0.4642857142857143, + "acc_stderr": 0.06724777654937658, + "f1": 0.3255198942180551 + }, + "copa": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316 + }, + "hellaswag": { + "acc": 0.45518820952001593, + "acc_stderr": 0.004969701081068371, + "acc_norm": 0.5893248356901015, + "acc_norm_stderr": 0.004909509538525173 + }, + "rte": { + "acc": 0.4729241877256318, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.5438042620363063, + "acc_stderr": 0.01399845361092433 + }, + "storycloze_2016": { + "acc": 0.6964190272581507, + "acc_stderr": 0.010632901358518371 + }, + "boolq": { + "acc": 0.5165137614678899, + "acc_stderr": 0.008740284046486645 + }, + "arc_easy": { + "acc": 0.5378787878787878, + "acc_stderr": 0.010230299628864799, + "acc_norm": 0.5206228956228957, + "acc_norm_stderr": 0.010251052755716122 + }, + "arc_challenge": { + "acc": 0.26535836177474403, + "acc_stderr": 0.012902554762313967, + "acc_norm": 0.3037542662116041, + "acc_norm_stderr": 0.01343890918477876 + }, + "sciq": { + "acc": 0.806, + "acc_stderr": 0.012510816141264362, + "acc_norm": 0.777, + "acc_norm_stderr": 0.013169830843425672 + }, + "piqa": { + "acc": 0.719804134929271, + "acc_stderr": 0.01047812201557708, + "acc_norm": 0.7274211099020674, + "acc_norm_stderr": 0.010389256803296018 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_5.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a05a37eb46126eba95cca470120f827097cad7d5 --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_5.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.354, + "acc_stderr": 0.015129868238451772 + }, + "anli_r2": { + "acc": 0.36, + "acc_stderr": 0.015186527932040122 + }, + "anli_r3": { + "acc": 0.365, + "acc_stderr": 0.013903485981413582 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.0673769750864465, + "f1": 0.33071988595866 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.4561840270862378, + "acc_stderr": 0.004970585328297622, + "acc_norm": 0.5903206532563234, + "acc_norm_stderr": 0.004907694727935688 + }, + "rte": { + "acc": 0.5054151624548736, + "acc_stderr": 0.03009469812323996 + }, + "winogrande": { + "acc": 0.5406471981057617, + "acc_stderr": 0.014005973823825124 + }, + "storycloze_2016": { + "acc": 0.6990913949759487, + "acc_stderr": 0.010606289538707334 + }, + "boolq": { + "acc": 0.5116207951070336, + "acc_stderr": 0.008742692742551265 + }, + "arc_easy": { + "acc": 0.5269360269360269, + "acc_stderr": 0.01024488474062011, + "acc_norm": 0.5122053872053872, + "acc_norm_stderr": 0.010256726235129012 + }, + "arc_challenge": { + "acc": 0.27559726962457337, + "acc_stderr": 0.013057169655761838, + "acc_norm": 0.30119453924914674, + "acc_norm_stderr": 0.013406741767847617 + }, + "sciq": { + "acc": 0.817, + "acc_stderr": 0.012233587399477823, + "acc_norm": 0.79, + "acc_norm_stderr": 0.01288666233227455 + }, + "piqa": { + "acc": 0.720348204570185, + "acc_stderr": 0.01047189953030656, + "acc_norm": 0.7252448313384113, + "acc_norm_stderr": 0.010415033676676056 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_5_lm-eval_global_step84877_2023-05-15-10-07-44_5shots_backup.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_5_lm-eval_global_step84877_2023-05-15-10-07-44_5shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..a05a37eb46126eba95cca470120f827097cad7d5 --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_5_lm-eval_global_step84877_2023-05-15-10-07-44_5shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.354, + "acc_stderr": 0.015129868238451772 + }, + "anli_r2": { + "acc": 0.36, + "acc_stderr": 0.015186527932040122 + }, + "anli_r3": { + "acc": 0.365, + "acc_stderr": 0.013903485981413582 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.0673769750864465, + "f1": 0.33071988595866 + }, + "copa": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845 + }, + "hellaswag": { + "acc": 0.4561840270862378, + "acc_stderr": 0.004970585328297622, + "acc_norm": 0.5903206532563234, + "acc_norm_stderr": 0.004907694727935688 + }, + "rte": { + "acc": 0.5054151624548736, + "acc_stderr": 0.03009469812323996 + }, + "winogrande": { + "acc": 0.5406471981057617, + "acc_stderr": 0.014005973823825124 + }, + "storycloze_2016": { + "acc": 0.6990913949759487, + "acc_stderr": 0.010606289538707334 + }, + "boolq": { + "acc": 0.5116207951070336, + "acc_stderr": 0.008742692742551265 + }, + "arc_easy": { + "acc": 0.5269360269360269, + "acc_stderr": 0.01024488474062011, + "acc_norm": 0.5122053872053872, + "acc_norm_stderr": 0.010256726235129012 + }, + "arc_challenge": { + "acc": 0.27559726962457337, + "acc_stderr": 0.013057169655761838, + "acc_norm": 0.30119453924914674, + "acc_norm_stderr": 0.013406741767847617 + }, + "sciq": { + "acc": 0.817, + "acc_stderr": 0.012233587399477823, + "acc_norm": 0.79, + "acc_norm_stderr": 0.01288666233227455 + }, + "piqa": { + "acc": 0.720348204570185, + "acc_stderr": 0.01047189953030656, + "acc_norm": 0.7252448313384113, + "acc_norm_stderr": 0.010415033676676056 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file