j-tobias committed · d521dce
Parent(s): 09b2769

small updates and improved complexity
Files changed:
- __pycache__/dataset.cpython-310.pyc +0 -0
- __pycache__/model.cpython-310.pyc +0 -0
- __pycache__/processing.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- app.py +33 -34
- processing.py +74 -29
- utils.py +0 -35
__pycache__/dataset.cpython-310.pyc DELETED (binary file, 3.34 kB)
__pycache__/model.cpython-310.pyc DELETED (binary file, 3.74 kB)
__pycache__/processing.cpython-310.pyc DELETED (binary file, 4.24 kB)
__pycache__/utils.cpython-310.pyc DELETED (binary file, 1.04 kB)
app.py CHANGED

@@ -1,50 +1,34 @@
 import gradio as gr
 from processing import run
-
-# from utils import hf_login
-# hf_login()
-
+import json
 from huggingface_hub import login
 import os
 
+
+
+# LOG INTO HUGGING FACE
 hf_token = os.getenv("HF_Token")
 login(hf_token)
 
+def hf_login():
+    hf_token = os.getenv("HF_Token")
+    if hf_token is None:
+        with open("credentials.json", "r") as f:
+            hf_token = json.load(f)["token"]
+    login(token=hf_token, add_to_git_credential=True)
 
-
-DATASET_OPTIONS = ["Common Voice", "VoxPopuli", "OWN Recoding/Sample"]
+# hf_login()
 
 
-# def eval(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:str)->str:
-
-#     print("OWN AUDIO: ", type(own_audio), own_audio)
-
-#     wer_result_1, wer_result_2, references, transcriptions1, transcriptions2 = run(data_subset, model_1, model_2, own_audio, own_transcription)
-
-#
-
-#     #### {model_2}
-#     - WER Score: {wer_result_2}"""
-
-#     # Create the bar plot
-#     fig = go.Figure(
-#         data=[
-#             go.Bar(x=[f"{model_1}"], y=[wer_result_1]),
-#             go.Bar(x=[f"{model_2}"], y=[wer_result_2]),
-#         ]
-#     )
-
-#     # Update the layout for better visualization
-#     fig.update_layout(
-#         title="Comparison of Two Models",
-#         xaxis_title="Models",
-#         yaxis_title="Value",
-#         barmode="group",
-#     )
-
-#     return results_md, fig
+# GENERAL OPTIONS FOR MODELS AND DATASETS
+MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr"]
+DATASET_OPTIONS = ["Common Voice", "OWN Recoding/Sample"]
 
+# HELPER FUNCTIONS
 def get_card(selected_model:str)->str:
 
     with open("cards.txt", "r") as f:

@@ -63,6 +47,14 @@ def is_own(selected_option):
     else:
         return gr.update(visible=False), gr.update(visible=False)
 
+def make_visible():
+    return gr.update(visible=True), gr.update(visible=True)
+
+
+
+
+
+# THE ACTUAL APP
 with gr.Blocks() as demo:

@@ -126,7 +118,14 @@ Happy experimenting and comparing! 🚀""")
 
     gr.Markdown('## <p style="text-align: center;">Results</p>')
     results_md = gr.Markdown("")
-    results_plot = gr.Plot(show_label=False)
-
+    results_plot = gr.Plot(show_label=False, visible=False)
+    results_df = gr.DataFrame(
+        visible=False,
+        row_count=(5, "dynamic"),  # allow a dynamic number of rows
+        interactive=False,         # read-only: users cannot edit the table
+        wrap=True,                 # wrap long text onto multiple lines
+    )
+    eval_btn.click(make_visible, outputs=[results_plot, results_df])
+    eval_btn.click(run, [data_subset, model_1, model_2, own_audio, own_transcription], [results_md, results_plot, results_df], show_progress=False)
 
 demo.launch(debug=True)
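A note on the new wiring: eval_btn.click now binds run, which yields (markdown, figure, dataframe) tuples instead of returning once. Gradio treats generator handlers as streams and re-renders the bound outputs on every yield, which is what produces the live per-sample progress text. A minimal sketch of the same pattern (slow_eval and the component names are illustrative, not from this commit):

import time
import gradio as gr

def slow_eval(n_steps):
    # Each yield re-renders the bound output component, so the UI
    # shows intermediate results while the loop is still running.
    for i in range(1, int(n_steps) + 1):
        time.sleep(0.5)  # stand-in for per-sample transcription work
        yield f"{i}/{int(n_steps)} samples processed"

with gr.Blocks() as sketch:
    steps = gr.Number(value=5, label="Samples")
    status = gr.Markdown("")
    btn = gr.Button("Evaluate")
    btn.click(slow_eval, inputs=steps, outputs=status, show_progress=False)

sketch.launch()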
processing.py CHANGED

@@ -3,10 +3,12 @@ from transformers import Speech2TextForConditionalGeneration, Speech2TextProcess
 import plotly.graph_objs as go
 from datasets import load_dataset
 from datasets import Audio
-from transformers import pipeline
 import evaluate
 import librosa
 import numpy as np
+import pandas as pd
+
+N_SAMPLES = 30
 
 wer_metric = evaluate.load("wer")
 
@@ -33,11 +35,12 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
     dataset, text_column = load_Common_Voice()
     print("Dataset Loaded")
 
-    # check if models are the same
     model1, processor1 = load_model(model_1)
     model2, processor2 = load_model(model_2)
     print("Models Loaded")
 
+
+
     if data_subset == "OWN Recoding/Sample":
         sample = {"audio":{"array":audio,"sampling_rate":16000}}
         transcription1 = model_compute(model1, processor1, sample, model_1)
@@ -46,11 +49,12 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
         transcriptions1 = [transcription1]
         transcriptions2 = [transcription2]
         references = [own_transcription]
-
+
         wer1 = compute_wer(references, transcriptions1)
         wer2 = compute_wer(references, transcriptions2)
 
-        results_md = f"""
+        results_md = f"""
+#### {model_1}
 - WER Score: {wer1}
 
 #### {model_2}
@@ -59,8 +63,8 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
         # Create the bar plot
         fig = go.Figure(
             data=[
-                go.Bar(x=[f"{model_1}"], y=[wer1]),
-                go.Bar(x=[f"{model_2}"], y=[wer2]),
+                go.Bar(x=[f"{model_1}"], y=[wer1], showlegend=False),
+                go.Bar(x=[f"{model_2}"], y=[wer2], showlegend=False),
             ]
         )
         # Update the layout for better visualization
@@ -71,14 +75,20 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
             barmode="group",
        )
 
-
+        df = pd.DataFrame({"references":references, "transcriptions 1":transcriptions1,"WER 1":[wer1],"transcriptions 2":transcriptions2,"WER 2":[wer2]})
+
+        yield results_md, fig, df
 
     else:
         references = []
         transcriptions1 = []
         transcriptions2 = []
+        WER1s = []
+        WER2s = []
+
+
         counter = 0
-        for sample in dataset:
+        for i, sample in enumerate(dataset, start=1):
             print(counter)
             counter += 1
 
@@ -89,25 +99,30 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
 
                 transcriptions1.append(transcription)
                 transcriptions2.append(transcription)
-            else:
-
-
+            else:
+                transcription1 = model_compute(model1, processor1, sample, model_1)
+                transcription2 = model_compute(model2, processor2, sample, model_2)
+                transcriptions1.append(transcription1)
+                transcriptions2.append(transcription2)
 
-        wer1 = compute_wer(references, transcriptions1)
-        wer2 = compute_wer(references, transcriptions2)
+            WER1s.append(compute_wer([sample[text_column]], [transcription1]))
+            WER2s.append(compute_wer([sample[text_column]], [transcription2]))
 
-        results_md = f"""
-
+            results_md = f"""
+{i}/{len(dataset)}-{'#'*i}{'_'*(N_SAMPLES-i)}
+
+#### {model_1}
+- WER Score: {sum(WER1s)/N_SAMPLES}
 
 #### {model_2}
-- WER Score: {wer2}"""
+- WER Score: {sum(WER2s)/N_SAMPLES}"""
 
             # Create the bar plot
             fig = go.Figure(
                 data=[
-                    go.Bar(x=[f"{model_1}"], y=[wer1]),
-                    go.Bar(x=[f"{model_2}"], y=[wer2]),
+                    go.Bar(x=[f"{model_1}"], y=[sum(WER1s)/N_SAMPLES], showlegend=False),
+                    go.Bar(x=[f"{model_2}"], y=[sum(WER2s)/N_SAMPLES], showlegend=False),
                 ]
             )
 
@@ -119,32 +134,62 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
                 barmode="group",
             )
 
-
-
-
+            df = pd.DataFrame({"references":references, "transcriptions 1":transcriptions1,"WER 1":WER1s,"transcriptions 2":transcriptions2,"WER 2":WER2s})
+
+            yield results_md, fig, df
+
 
 
 # DATASET LOADERS
 def load_Common_Voice():
     dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", revision="streaming", split="test", streaming=True, token=True, trust_remote_code=True)
     text_column = "sentence"
-    dataset = dataset.take(
+    dataset = dataset.take(N_SAMPLES)
     dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
     dataset = list(dataset)
     return dataset, text_column
 
 def load_Vox_Populi():
-
+    # Load the dataset in streaming mode
+    dataset = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True)
+
+    # Preview the first item to understand the structure (can be removed in production)
     print(next(iter(dataset)))
-
-
+
+    # Take N_SAMPLES+20 examples to leave headroom for the filtering step
+    dataset = dataset.take(N_SAMPLES+20)
+    text_column = "normalized_text"
+
+    # Filter out samples with empty or unwanted 'normalized_text' values and invalid audio
+    dataset = dataset.filter(lambda x: is_valid_sample(x[text_column], x['audio']))
+
+    # Keep the first N_SAMPLES examples after filtering
+    dataset = dataset.take(N_SAMPLES)
+
+    # Cast the 'audio' column to the desired sampling rate
     dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
+
+    # Convert to list and return
     dataset = list(dataset)
     return dataset, text_column
 
-
+def is_valid_sample(text, audio):
+    # Check that 'normalized_text' is usable
+    text = text.strip()
+    if text == "" or text == "ignore time segment in scoring":
+        return False
+
+    # Check that the 'audio' array is not empty
+    if len(audio['array']) == 0:
+        return False
+
+    # Keep only audio between 1 and 60 seconds
+    duration = audio['array'].size / audio['sampling_rate']
+    if duration < 1.0 or duration > 60.0:
+        return False
+
+    return True
 
 
 # MODEL LOADERS
@@ -154,7 +199,7 @@ def load_model(model_id:str):
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
     elif model_id == "facebook/s2t-medium-librispeech-asr":
         model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr")
-        processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr"
+        processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr")
     else:
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
@@ -178,7 +223,7 @@ def model_compute(model, processor, sample, model_id):
         attention_mask = features.attention_mask
         gen_tokens = model.generate(input_features=input_features, attention_mask=attention_mask)
         transcription = processor.batch_decode(gen_tokens, skip_special_tokens=True)[0]
-        return transcription
+        return transcription
 
     else:
         return model(sample)
@@ -186,7 +231,7 @@ def model_compute(model, processor, sample, model_id):
 # UTILS
 def compute_wer(references, predictions):
     wer = wer_metric.compute(references=references, predictions=predictions)
-    wer = round(100 * wer, 2)
+    wer = round(N_SAMPLES * wer, 2)
     return wer
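The reworked else branch in run scores each sample individually and reports the running mean, rather than computing one corpus-level WER after the loop. A condensed sketch of that accumulation, using the same evaluate metric as the commit but dummy transcripts in place of model output:

import evaluate

wer_metric = evaluate.load("wer")

references = ["the cat sat", "hello world", "open the door"]
predictions = ["the cat sat", "hello word", "open a door"]  # dummy model output

wers = []
for ref, pred in zip(references, predictions):
    # One score per sample, as in the updated run(); note that averaging
    # per-sample WERs weights every utterance equally, whereas a single
    # corpus-level WER weights utterances by their word count.
    wers.append(wer_metric.compute(references=[ref], predictions=[pred]))
    print(f"{len(wers)}/{len(references)} - running mean WER: {sum(wers) / len(wers):.3f}")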
utils.py DELETED

@@ -1,35 +0,0 @@
-from huggingface_hub import login
-import json
-import evaluate
-import os
-
-def hf_login():
-    hf_token = os.getenv("HF_Token")
-    print(hf_token)
-    if hf_token is None:
-        with open("credentials.json", "r") as f:
-            hf_token = json.load(f)["token"]
-    login(token=hf_token, add_to_git_credential=True)
-
-def data(dataset):
-    for i, item in enumerate(dataset):
-        yield {**item["audio"], "reference": item["norm_text"]}
-
-def compute_wer(references, predictions):
-    wer_metric = evaluate.load("wer")
-    wer = wer_metric.compute(references=references, predictions=predictions)
-    wer = round(100 * wer, 2)
-    return wer
-
-
-# def run_tests (dataset_choice:str, model:str):
-
-#     MoDeL = Model()
-#     MoDeL.select(model)
-#     MoDeL.load()
-#     DaTaSeT = Dataset(100)
-#     DaTaSeT.load(dataset_choice)
-#     references, predictions = MoDeL.process(DaTaSeT)
-#     wer = compute_wer(references=references, predictions=predictions)
-#     return wer
-
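The hf_login helper removed here lives on in app.py with the same token-resolution order: environment variable first, local credentials.json second. A standalone sketch of that fallback (resolve_token is an illustrative name; a credentials.json of the form {"token": "hf_..."} is assumed):

import json
import os

from huggingface_hub import login

def resolve_token(env_var="HF_Token", path="credentials.json"):
    # Prefer the environment (e.g. a Space secret); fall back to a local
    # file so the app also runs on a development machine.
    token = os.getenv(env_var)
    if token is None:
        with open(path, "r") as f:
            token = json.load(f)["token"]
    return token

login(token=resolve_token(), add_to_git_credential=True)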