import gradio as gr
import pandas as pd
import json
from constants import BANNER, INTRODUCTION_TEXT, CITATION_TEXT, METRICS_TAB_TEXT, DIR_OUTPUT_REQUESTS
from init import is_model_on_hub, upload_file, load_all_info_from_dataset_hub
from utils_display import AutoEvalColumn, fields, make_clickable_model, styled_error, styled_message
from datetime import datetime, timezone

LAST_UPDATED = "Sep 9th 2023"

column_names = {
    "MODEL": "Model",
    "Avg. WER": "Average WER ⬇️",
    "RTF": "RTF (1e-3) ⬇️",
    "Common Voice WER": "Common Voice WER ⬇️",
    "D_AVG_CV_WER": "Delta AVG-CV WER",
}

# Skipping the tests and just using the numbers computed in the original space, for my sanity's sake.
# eval_queue_repo, requested_models, csv_results = load_all_info_from_dataset_hub()
# if not csv_results.exists():
#     raise Exception(f"CSV file {csv_results} does not exist locally")
# # Get csv with data and parse columns
# original_df = pd.read_csv(csv_results)

# Since the dataset-hub loading above is disabled, keep a local list of requested
# models so that request_model() below still works.
requested_models = []

data = [
    ["nvidia/stt_en_fastconformer_transducer_xlarge", 12.3, 8.06, 7.26],
    ["nvidia/stt_en_fastconformer_transducer_xxlarge", 14.4, 8.07, 6.07],
    ["openai/whisper-large-v2", 12.7, 8.16, 10.12],
    ["nvidia/stt_en_fastconformer_ctc_xxlarge", 5, 8.34, 8.31],
    ["nvidia/stt_en_conformer_ctc_large", 7.5, 8.39, 9.1],
    ["openai/whisper-medium.en", 10.7, 8.5, 11.96],
    ["nvidia/stt_en_fastconformer_ctc_xlarge", 2.9, 8.52, 7.51],
    ["nvidia/stt_en_fastconformer_ctc_large", 1.8, 8.9, 8.56],
    ["nvidia/stt_en_fastconformer_transducer_large", 10.4, 8.94, 8.04],
    ["openai/whisper-large", 12.7, 9.2, 10.92],
    ["nvidia/stt_en_conformer_transducer_large", 21.8, 9.27, 7.36],
    ["openai/whisper-small.en", 8.3, 9.34, 15.13],
    ["nvidia/stt_en_conformer_transducer_small", 17.7, 10.81, 14.35],
    ["openai/whisper-base.en", 7.2, 11.67, 21.77],
    ["nvidia/stt_en_conformer_ctc_small", 3.2, 11.77, 16.59],
    ["patrickvonplaten/wav2vec2-large-960h-lv60-self-4-gram", 20.1, 13.65, 20.05],
    ["facebook/wav2vec2-large-960h-lv60-self", 2.5, 14.47, 22.15],
    ["openai/whisper-tiny.en", 9.1, 14.96, 31.09],
    ["patrickvonplaten/hubert-xlarge-ls960-ft-4-gram", 24.5, 15.11, 19.16],
    ["speechbrain/asr-wav2vec2-librispeech", 2.6, 15.61, 23.71],
    ["facebook/hubert-xlarge-ls960-ft", 6.3, 15.81, 22.05],
    ["facebook/mms-1b-all", 5.9, 15.85, 21.23],
    ["facebook/hubert-large-ls960-ft", 2.6, 15.93, 23.12],
    ["facebook/wav2vec2-large-robust-ft-libri-960h", 2.7, 16.07, 22.57],
    ["facebook/wav2vec2-conformer-rel-pos-large-960h-ft", 5.2, 17, 23.01],
    ["facebook/wav2vec2-conformer-rope-large-960h-ft", 7.8, 17.06, 23.08],
    ["facebook/wav2vec2-large-960h", 1.8, 21.76, 34.01],
    ["facebook/wav2vec2-base-960h", 1.2, 26.41, 41.75],
]

columns = ["Model", "RTF (1e-3) ⬇️", "Average WER ⬇️", "Common Voice WER ⬇️"]

original_df = pd.DataFrame(data, columns=columns)

# Formats the columns
def formatter(x):
    return round(x, 2)

for col in original_df.columns:
    if col == "Model":
        original_df[col] = original_df[col].apply(make_clickable_model)
    else:
        original_df[col] = original_df[col].apply(formatter)  # For numerical values

original_df.rename(columns=column_names, inplace=True)
original_df.sort_values(by="Common Voice WER ⬇️", inplace=True)

# Compute the delta between the average WER and the Common Voice WER
original_df["Delta Avg. C.V. WER"] = original_df["Average WER ⬇️"] - original_df["Common Voice WER ⬇️"]
original_df["Delta Avg. C.V. WER"] = original_df["Delta Avg. C.V. WER"].apply(formatter)
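
# --- Illustrative sketch (not used by the app) --------------------------------
# The legend shown in the UI below argues that the Common Voice WER column is a
# cheap, good-enough signal for shortlisting a model programmatically. This is a
# minimal sketch of that idea, assuming the column labels used in this space;
# `max_cv_wer` is an arbitrary example threshold, and the returned value is the
# clickable "Model" display cell.
def pick_model_by_cv_wer(df, max_cv_wer=15.0):
    """Among models whose Common Voice WER is at most `max_cv_wer`,
    return the fastest one (lowest RTF), or None if there is no candidate."""
    candidates = df[df["Common Voice WER ⬇️"] <= max_cv_wer]
    if candidates.empty:
        return None
    return candidates.sort_values("RTF (1e-3) ⬇️").iloc[0]["Model"]

# Example usage: pick_model_by_cv_wer(original_df, max_cv_wer=10.0)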
COLS = [c.name for c in fields(AutoEvalColumn)]
TYPES = [c.type for c in fields(AutoEvalColumn)]


def request_model(model_text, chbcoco2017):
    # Determine the selected checkboxes
    dataset_selection = []
    if chbcoco2017:
        dataset_selection.append("ESB Datasets tests only")

    if len(dataset_selection) == 0:
        return styled_error("You need to select at least one dataset")

    base_model_on_hub, error_msg = is_model_on_hub(model_text)

    if not base_model_on_hub:
        return styled_error(f"Base model '{model_text}' {error_msg}")

    # Construct the output dictionary
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    required_datasets = ', '.join(dataset_selection)
    eval_entry = {
        "date": current_time,
        "model": model_text,
        "datasets_selected": required_datasets,
    }

    # Prepare the file path
    DIR_OUTPUT_REQUESTS.mkdir(parents=True, exist_ok=True)

    fn_datasets = '@ '.join(dataset_selection)
    filename = model_text.replace("/", "@") + "@@" + fn_datasets
    if filename in requested_models:
        return styled_error(f"A request for this model '{model_text}' and dataset(s) was already made.")
    try:
        filename_ext = filename + ".txt"
        out_filepath = DIR_OUTPUT_REQUESTS / filename_ext

        # Write the request to a text file
        with open(out_filepath, "w") as f:
            f.write(json.dumps(eval_entry))

        upload_file(filename, out_filepath)

        # Include the file in the list of uploaded files
        requested_models.append(filename)

        # Remove the local file
        out_filepath.unlink()

        return styled_message("🤗 Your request has been submitted and will be evaluated soon!")
    except Exception as e:
        return styled_error("Error submitting request!")


with gr.Blocks() as demo:
    gr.HTML(BANNER, elem_id="banner")
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    CUSTOM_MESSAGE = """Legend: This space is a fork of the original [hf-audio/open_asr_leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).
It aims to show that the Common Voice test set is large enough, for most languages, to give a relatively good approximation of the average WER/CER at a much lower computational cost.

#### Why is this useful?
Because it gives us a standardized test set for most languages, allowing us to programmatically choose a relatively good model for any language supported by Common Voice.

`Model`, `RTF (1e-3) ⬇️` and `Average WER ⬇️` were reported from [hf-audio/open_asr_leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard) on Sept. 9, 2023, using the version from Sept. 7, 2023.

### Results
The Common Voice test set gives a word error rate (WER) within less than 20 points of the average WER. That is not great: don't rely on Common Voice alone to choose the most adequate architecture. But to quickly find a suitable ASR model for a large panel of languages in a programmatic fashion, it's close enough."""
    gr.Markdown(CUSTOM_MESSAGE, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            leaderboard_table = gr.components.Dataframe(
                value=original_df,
                datatype=TYPES,
                max_rows=None,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
            )

        with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-table", id=1):
            gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")

        with gr.TabItem("✉️✨ Request a model here!", elem_id="od-benchmark-tab-table", id=2):
            with gr.Column():
                gr.Markdown("# ✉️✨ Request results for a new model here!", elem_classes="markdown-text")
            with gr.Column():
                gr.Markdown("Select a dataset:", elem_classes="markdown-text")
            with gr.Column():
                model_name_textbox = gr.Textbox(label="Model name (user_name/model_name)")
                chb_coco2017 = gr.Checkbox(label="COCO validation 2017 dataset", visible=False, value=True, interactive=False)
            with gr.Column():
                mdw_submission_result = gr.Markdown()
                btn_submit = gr.Button(value="🚀 Request")
                btn_submit.click(request_model, [model_name_textbox, chb_coco2017], mdw_submission_result)

    gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_TEXT,
                lines=7,
                label="Copy the BibTeX snippet to cite this source",
                elem_id="citation-button",
            ).style(show_copy_button=True)

demo.launch()