# Author: Martin Fajcik

import argparse
import copy
import glob
import hashlib
import os
import json
import re

import jsonlines
from tqdm import tqdm

SUPPORTED_METRICS = [
    "avg_mcauroc",  # for classification tasks
    "exact_match",  # for QA tasks
    "acc",  # for multichoice tasks
    "rouge_raw_r2_mid_f_without_bootstrap",  # for summarization tasks
    "rouge_raw_r2_mid_f",  # for summarization tasks, older metric version for back compatibility
    "word_perplexity",  # for language modeling tasks
]
EXTRA_INFO_RELEASE_KEYS = [
    'filtered_resps',
    'doc_id',
]

with open("leaderboard/metadata.json", "r") as f:
    METADATA = json.load(f)

# TASK MAP
# from promptname to taskname
MAP = {
    'benchmark_agree': 'benczechmark_agree',
    'benchmark_belebele': 'benczechmark_belebele',
    'benchmark_czechnews': 'benczechmark_czechnews',
    'benchmark_subjectivity': 'benczechmark_subjectivity',
    'benczechmark_snli': 'benczechmark_snli',
    'propaganda_argumentace': 'benczechmark_propaganda_argumentace',
    'propaganda_fabulace': 'benczechmark_propaganda_fabulace',
    'propaganda_nazor': 'benczechmark_propaganda_nazor',
    'propaganda_strach': 'benczechmark_propaganda_strach',
    'propaganda_zamereni': 'benczechmark_propaganda_zamereni',
    'propaganda_demonizace': 'benczechmark_propaganda_demonizace',
    'propaganda_lokace': 'benczechmark_propaganda_lokace',
    'propaganda_relativizace': 'benczechmark_propaganda_relativizace',
    'propaganda_vina': 'benczechmark_propaganda_vina',
    'propaganda_zanr': 'benczechmark_propaganda_zanr',
    'propaganda_emoce': 'benczechmark_propaganda_emoce',
    'propaganda_nalepkovani': 'benczechmark_propaganda_nalepkovani',
    'propaganda_rusko': 'benczechmark_propaganda_rusko',
    'benczechmark_sentiment_mall': 'benczechmark_sentiment_mall',
    'benczechmark_sentiment_fb': 'benczechmark_sentiment_fb',
    'benczechmark_sentiment_csfd': 'benczechmark_sentiment_csfd',
    'benczechmark_summarization': 'benczechmark_summarization',
    'gec': 'benczechmark_grammarerrorcorrection',
    'cs_nq_open': 'benczechmark_cs_naturalquestions',
    'cs_sqad_open': 'benczechmark_cs_sqad32',
    'cs_triviaqa': 'benczechmark_cs_triviaQA',
    'csfever': 'benczechmark_csfever_nli',
    'ctkfacts': 'benczechmark_ctkfacts_nli',
    'cnec_ner': 'benczechmark_cs_ner',
    'cdec_ner': 'benczechmark_cs_court_decisions_ner',
    'klokan_qa': 'benczechmark_klokan_qa',
    'umimeto_biology': 'benczechmark_umimeto_biology',
    'umimeto_chemistry': 'benczechmark_umimeto_chemistry',
    'umimeto_czech': 'benczechmark_umimeto_czech',
    'umimeto_history': 'benczechmark_umimeto_history',
    'umimeto_informatics': 'benczechmark_umimeto_informatics',
    'umimeto_math': 'benczechmark_umimeto_math',
    'umimeto_physics': 'benczechmark_umimeto_physics',
    'cermat_czech_open': 'benczechmark_cermat_czech_open',
    'cermat_czech_mc': 'benczechmark_cermat_czech_mc',
    'cermat_czech_tf': 'benczechmark_cermat_czech_tf',
    'cermat_czmath_open': 'benczechmark_cermat_czmath_open',
    'cermat_czmath_mc': 'benczechmark_cermat_czmath_mc',
    'history_ir': 'benczechmark_history_ir',
    'benczechmark_histcorpus': "benczechmark_histcorpus",
    'benczechmark_hellaswag': "benczechmark_hellaswag",
    'benczechmark_essay': 'benczechmark_essay',
    'benczechmark_fiction': 'benczechmark_fiction',
    'benczechmark_capek': 'benczechmark_capek',
    'benczechmark_correspondence': 'benczechmark_correspondence',
    'benczechmark_havlicek': 'benczechmark_havlicek',
    'benczechmark_speeches': 'benczechmark_speeches',
    'benczechmark_spoken': 'benczechmark_spoken',
    'benczechmark_dialect': 'benczechmark_dialect'
}

NO_PROMPT_TASKS = ["benczechmark_histcorpus", "benczechmark_hellaswag", "benczechmark_essay",
                   "benczechmark_fiction", "benczechmark_capek", "benczechmark_correspondence",
                   "benczechmark_havlicek", "benczechmark_speeches", "benczechmark_spoken",
                   "benczechmark_dialect"]
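
# Expected on-disk layout of each input folder (illustrative sketch, inferred from
# the loading logic in process_harness_logs below; exact names depend on how
# lm-evaluation-harness was invoked):
#
#   <input_folder>/
#       <model_subfolder>/                       # the first os.listdir() entry is used
#           results_<timestamp>.json             # aggregate harness results (name must start with "results")
#           samples_<task>_<prompt_idx>_<...>.jsonl   # per-document predictions, one file per prompt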
"benczechmark_fiction", "benczechmark_capek", "benczechmark_correspondence", "benczechmark_havlicek", "benczechmark_speeches", "benczechmark_spoken", "benczechmark_dialect"] def resolve_taskname(taskname): if taskname not in MAP: raise ValueError(f"Taskname {taskname} not found.") return MAP[taskname] def rename_keys(d, resolve_taskname): orig_len = len(d) for k, v in list(d.items()): new_key = resolve_taskname(k) d[new_key] = d.pop(k) # make sure list length didnt changed assert len(d) == orig_len def process_harness_logs(input_folders, output_file): """ - Selects best prompt for each task - Extract data for that prompt, necessary for targe/mnt/data/ifajcik/micromamba/envs/envs/lmharnest metrics """ def expand_input_folders(input_folders): # Check if input_folders is a wildcard pattern if '*' in input_folders or '?' in input_folders: # Expand the wildcard into a list of matching directories matching_directories = [f for f in glob.glob(input_folders) if os.path.isdir(f)] return matching_directories else: # If it's not a wildcard, return the input as a single-item list if it's a valid directory if os.path.isdir(input_folders): return [input_folders] else: return [] input_folders = expand_input_folders(input_folders) per_task_results = {} metric_per_task = {} predictions = {} all_harness_results = dict() for input_folder in tqdm(input_folders, desc="Loading files"): # read all files in input_folder # consider first folder within this folder input_folder = os.path.join(input_folder, os.listdir(input_folder)[0]) # find file which starts with results... prefix in the input_folder result_file = [f for f in os.listdir(input_folder) if f.startswith("results")][0] with open(os.path.join(input_folder, result_file), "r") as f: harness_results = json.load(f) all_harness_results[list(harness_results['results'].values())[0]['alias']] = harness_results current_multipleprompt_tasknames = [] for name, result in harness_results['results'].items(): if name in NO_PROMPT_TASKS: # not prompts taskname = name # process metric names for k, v in copy.deepcopy(result).items(): if "," in k: name, _ = k.split(",") del result[k] result[name] = v per_task_results[taskname] = result if result['alias'].strip().startswith('- prompt-'): # process taskname taskname = name[:-1] if taskname.endswith("_"): taskname = taskname[:-1] # process metric names for k, v in copy.deepcopy(result).items(): if "," in k: name, key = k.split(",") del result[k] result[name] = v if taskname not in per_task_results: per_task_results[taskname] = [result] current_multipleprompt_tasknames.append(taskname) else: per_task_results[taskname].append(result) # get best result according to metric priority given in SUPPORTED_METRICS list for taskname, results in per_task_results.items(): if not taskname in current_multipleprompt_tasknames: continue best_result = None target_metric = None for m in SUPPORTED_METRICS: if m in results[0]: target_metric = m break if target_metric is None: raise ValueError(f"No supported metric found in {taskname}") metric_per_task[taskname] = target_metric all_measured_results = [] for result in results: all_measured_results.append(result[target_metric]) if best_result is None: best_result = result else: if result[target_metric] > best_result[target_metric]: best_result = result # Compute max-centered variance max_value = best_result[target_metric] squared_diffs = [(x * 100.0 - max_value * 100.0) ** 2 for x in all_measured_results] max_centered_variance = sum(squared_diffs) / (len(squared_diffs) - 1) 
            best_result['max_centered_variance'] = max_centered_variance
            per_task_results[taskname] = best_result

        for file in os.listdir(input_folder):
            if file == result_file or not file.startswith("samples") or not file.endswith(".jsonl"):
                continue
            # match the samples file to its task and keep only the winning prompt's predictions
            for taskname in per_task_results.keys():
                if taskname in file:
                    print(f"Processing {os.path.join(input_folder, file)} for {taskname}")
                    # check that this file corresponds to the same (winning) prompt
                    winning_prompt = per_task_results[taskname]['alias'][-1]
                    if taskname in NO_PROMPT_TASKS:
                        current_prompt = "-1"
                    else:
                        try:
                            current_prompt = re.search(rf"{taskname}_(\d+)_", file).group(1)
                        except AttributeError:
                            raise ValueError(f"Prompt not found in {file}")
                    if winning_prompt == current_prompt or taskname in NO_PROMPT_TASKS:
                        # load file contents
                        predictions[taskname] = list(jsonlines.open(os.path.join(input_folder, file)))

                        # only keep data necessary for metrics
                        for prediction in predictions[taskname]:
                            for key in list(prediction.keys()):
                                if key not in SUPPORTED_METRICS + EXTRA_INFO_RELEASE_KEYS:
                                    del prediction[key]

    # rename keys (tasknames) using resolve_taskname
    rename_keys(predictions, resolve_taskname)
    rename_keys(per_task_results, resolve_taskname)

    # assert keys in predictions and results are the same
    # assert set(predictions.keys()) == set(per_task_results.keys())
    if set(predictions.keys()) != set(per_task_results.keys()):
        # tasks that have results but no matching predictions
        print("Missing keys in predictions:")
        print(set(per_task_results.keys()) - set(predictions.keys()))
        # tasks that have predictions but no matching results
        print("Extra keys in predictions:")
        print(set(predictions.keys()) - set(per_task_results.keys()))
        raise ValueError("Keys in predictions and results are not the same")

    aggregated_predictions = dict()
    aggregated_predictions["predictions"] = predictions
    aggregated_predictions["results"] = per_task_results
    aggregated_predictions["metadata"] = {
        'git_hash': harness_results['git_hash'],
        'transformers_version': harness_results['transformers_version'],
        'tokenizer_pad_token': harness_results['tokenizer_pad_token'],
        'tokenizer_eos_token': harness_results['tokenizer_eos_token'],
        'tokenizer_bos_token': harness_results['tokenizer_bos_token'],
        'eot_token_id': harness_results['eot_token_id'],
        'max_length': harness_results['max_length'],
        'task_hashes': harness_results['task_hashes'],
        'model_source': harness_results['model_source'],
        'model_name': harness_results['model_name'],
        'model_name_sanitized': harness_results['model_name_sanitized'],
        'system_instruction': harness_results['system_instruction'],
        'system_instruction_sha': harness_results['system_instruction_sha'],
        'fewshot_as_multiturn': harness_results['fewshot_as_multiturn'],
        'chat_template': harness_results['chat_template'],
        'chat_template_sha': harness_results['chat_template_sha'],
        'total_evaluation_time_seconds': {k: v['total_evaluation_time_seconds'] for k, v in all_harness_results.items()},
        'n-shot': all_harness_results['CTKFacts NLI']['n-shot']['ctkfacts_0']
    }

    # make sure all tasks are present
    all_tasks = set(METADATA["tasks"].keys())
    all_expected_tasks = set(per_task_results.keys())
    all_missing_tasks = all_tasks - all_expected_tasks
    all_extra_tasks = all_expected_tasks - all_tasks
    if len(all_missing_tasks) > 0:
        EOLN = "\n"
        # print(f"Missing tasks: {EOLN.join(all_missing_tasks)}")
        raise Exception(f"Missing tasks: {EOLN.join(all_missing_tasks)}")  # TODO: uncomment
    if len(all_extra_tasks) > 0:
        EOLN = "\n"
        raise Exception(f"Extra tasks: {EOLN.join(all_extra_tasks)}")

    with open(output_file, "w") as f:
        json.dump(aggregated_predictions, f)

    print("Success!")
    print("Output saved to", output_file)
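
# Shape of the emitted submission file (sketch only; task names and field values
# are illustrative, but the three top-level keys match the code above):
#
# {
#   "predictions": {"benczechmark_ctkfacts_nli": [{"doc_id": 0, "filtered_resps": [...], ...}, ...], ...},
#   "results":     {"benczechmark_ctkfacts_nli": {"acc": ..., "max_centered_variance": ..., ...}, ...},
#   "metadata":    {"git_hash": ..., "model_name": ..., "n-shot": ..., ...}
# }
# Per-document prediction dicts keep only the keys listed in SUPPORTED_METRICS
# and EXTRA_INFO_RELEASE_KEYS.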
to", output_file) def main(): parser = argparse.ArgumentParser( description="Process outputs of lm harness into minimum compatible format necessary for leaderboard submission.") parser.add_argument("-i", "-f", "--input_folder", "--folder", help="Folder with unprocessed results from lm harness.", required=True) parser.add_argument("-o", "--output_file", help="File to save processed results.", required=True) args = parser.parse_args() process_harness_logs(args.input_folder, args.output_file) if __name__ == "__main__": main()