# Task configuration for the `ifeval` task.
# NOTE(review): the key set (task, dataset_path, doc_to_text, metric_list, ...)
# looks like an lm-evaluation-harness task config -- confirm against the loader
# that consumes this file.
task: ifeval

# Dataset selection.
# NOTE(review): dataset_path is presumably a Hugging Face Hub repository id --
# confirm.
dataset_path: wis-k/instruction-following-eval
dataset_name: null

output_type: generate_until

# The split named "train" is the one configured as the test split.
test_split: train
num_fewshot: 0

# NOTE(review): `prompt` is presumably the name of the dataset field holding
# the input text, and `0` a placeholder target -- confirm with the consumer.
doc_to_text: prompt
doc_to_target: 0

# Generation settings: sampling disabled, temperature 0.0, an empty `until`
# list, and a cap of 1024 on max_gen_toks.
generation_kwargs:
  until: []
  do_sample: false
  temperature: 0.0
  max_gen_toks: 1024

# `!function` is a custom (non-core) YAML tag: the loader reading this file
# must register a constructor for it. It references names in a `utils` module
# that is not part of this file.
process_results: !function utils.process_results

# Four metrics: strict and loose variants at prompt level and at instruction
# level. Prompt-level metrics aggregate with `mean`; instruction-level metrics
# use the custom aggregator `utils.agg_inst_level_acc`.
metric_list:
  - metric: prompt_level_strict_acc
    aggregation: mean
    higher_is_better: true
  - metric: inst_level_strict_acc
    aggregation: !function utils.agg_inst_level_acc
    higher_is_better: true
  - metric: prompt_level_loose_acc
    aggregation: mean
    higher_is_better: true
  - metric: inst_level_loose_acc
    aggregation: !function utils.agg_inst_level_acc
    higher_is_better: true

# NOTE(review): metadata is kept as a one-element sequence, exactly as in the
# original. If the consumer looks up `metadata["version"]` on a mapping, this
# should become `metadata:` / `  version: 1.0` instead -- confirm before
# changing, since it alters the parsed structure.
metadata:
  - version: 1.0