import json
import os
import shutil
import sys
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

from utils import compute_average_wer, download_dataset


def main():
    """
    Main function to orchestrate the multilingual data generation process.

    This function performs the following steps:
    1. Downloads multilingual evaluation data if requested (``download`` argv flag).
    2. Processes multilingual evaluation JSON files found under the source directory.
    3. Calculates and saves results, including Word Error Rate (WER) and
       language detection confusion matrices.
    """
    source_repo = "argmaxinc/whisperkit-evals-multilingual"
    source_subfolder = "WhisperKit"
    source_directory = f"{source_repo}/{source_subfolder}"

    if len(sys.argv) > 1 and sys.argv[1] == "download":
        try:
            # Start from a clean local copy before re-downloading.
            shutil.rmtree(source_repo)
        except FileNotFoundError:
            # Narrowed from a bare `except:` — only "already absent" is benign.
            print("Nothing to remove.")
        download_dataset(source_repo, source_repo, source_subfolder)

    # Per-"<model>/<forced?>" accumulators for WER samples and
    # (reference, detected) language pairs.
    results = defaultdict(
        lambda: {
            "average_wer": [],
            "language_wer": defaultdict(list),
            "language_detection": [],
        }
    )
    confusion_matrices = {}

    for subdir, _, files in os.walk(source_directory):
        for filename in files:
            # Only per-run result JSONs; summary files are aggregates, skip them.
            if not filename.endswith(".json") or "summary" in filename:
                continue
            file_path = os.path.join(subdir, filename)
            with open(file_path, "r") as f:
                data = json.load(f)

            subdir_components = subdir.split(os.path.sep)
            is_forced = "forced" in subdir_components
            # A "forced" path component shifts the model name one level deeper.
            model = subdir_components[-3] if not is_forced else subdir_components[-4]
            key = f"{model}/{'forced' if is_forced else 'not_forced'}"

            for item in data["results"]:
                # Items without a reference language cannot contribute to
                # either WER-per-language or detection accuracy.
                if "reference_language" not in item:
                    continue
                reference_language = item["reference_language"]
                detected_language = item["predicted_language"]
                result = {
                    "reference": item["reference"],
                    "prediction": item["prediction"],
                }
                results[key]["average_wer"].append(result)
                results[key]["language_wer"][reference_language].append(result)
                results[key]["language_detection"].append(
                    (reference_language, detected_language)
                )

    calculate_and_save_results(results, confusion_matrices)


def calculate_and_save_results(results, confusion_matrices):
    """
    Calculate final multilingual metrics and save them to CSV and JSON files.

    :param results: Dictionary containing raw multilingual evaluation data,
        keyed by "<model>/<forced|not_forced>".
    :param confusion_matrices: Dictionary to store (and serialize) row-normalized
        confusion matrices for language detection, keyed by model then forced mode.

    This function processes the raw multilingual data, calculates average
    metrics, creates confusion matrices for language detection, and saves:
    1. A CSV file with WER data for each model and language.
    2. A JSON file with confusion matrices for language detection.
    """
    wer_data = []
    for key, data in results.items():
        model, forced = key.rsplit("/", 1)
        row = {
            "Model": model,
            "Forced Tokens": forced == "forced",
            "Average WER": compute_average_wer(data["average_wer"]),
        }
        for lang, wers in data["language_wer"].items():
            row[f"WER_{lang}"] = compute_average_wer(wers)
        wer_data.append(row)

        true_languages, detected_languages = zip(*data["language_detection"])
        unique_languages = sorted(set(true_languages))
        cm = confusion_matrix(
            true_languages, detected_languages, labels=unique_languages
        )

        # Row-normalize to per-reference-language rates, guarding against
        # division by zero for languages with no samples in that row.
        row_sums = cm.sum(axis=1)
        cm_normalized = np.zeros_like(cm, dtype=float)
        non_zero_rows = row_sums != 0
        cm_normalized[non_zero_rows] = (
            cm[non_zero_rows] / row_sums[non_zero_rows, np.newaxis]
        )

        confusion_matrices.setdefault(model, {})[forced] = {
            "matrix": cm_normalized.tolist(),
            "labels": unique_languages,
        }

    # Ensure the output directory exists before writing either artifact.
    os.makedirs("dashboard_data", exist_ok=True)
    df = pd.DataFrame(wer_data)
    df.to_csv("dashboard_data/multilingual_results.csv", index=False)
    with open("dashboard_data/multilingual_confusion_matrices.json", "w") as f:
        json.dump(confusion_matrices, f, indent=2)


if __name__ == "__main__":
    main()