j-tobias committed · d521dce
Parent(s): 09b2769

small updates and improved complexity
Files changed:
- __pycache__/dataset.cpython-310.pyc +0 -0
- __pycache__/model.cpython-310.pyc +0 -0
- __pycache__/processing.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- app.py +33 -34
- processing.py +74 -29
- utils.py +0 -35
__pycache__/dataset.cpython-310.pyc DELETED (binary file, 3.34 kB)
__pycache__/model.cpython-310.pyc DELETED (binary file, 3.74 kB)
__pycache__/processing.cpython-310.pyc DELETED (binary file, 4.24 kB)
__pycache__/utils.cpython-310.pyc DELETED (binary file, 1.04 kB)
app.py CHANGED

@@ -1,50 +1,34 @@
 import gradio as gr
 from processing import run
-
-# from utils import hf_login
-# hf_login()
-
+import json
 from huggingface_hub import login
 import os
 
+
+
+# LOG INTO HUGGING FACE
 hf_token = os.getenv("HF_Token")
 login(hf_token)
 
+def hf_login():
+    hf_token = os.getenv("HF_Token")
+    if hf_token is None:
+        with open("credentials.json", "r") as f:
+            hf_token = json.load(f)["token"]
+    login(token=hf_token, add_to_git_credential=True)
 
-
-DATASET_OPTIONS = ["Common Voice", "VoxPopuli", "OWN Recoding/Sample"]
+# hf_login()
 
 
-# def eval(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:str)->str:
-
-#     print("OWN AUDIO: ", type(own_audio), own_audio)
-
-#     wer_result_1, wer_result_2, references, transcriptions1, transcriptions2 = run(data_subset, model_1, model_2, own_audio, own_transcription)
-
-#
-
-#     #### {model_2}
-#     - WER Score: {wer_result_2}"""
-
-#     # Create the bar plot
-#     fig = go.Figure(
-#         data=[
-#             go.Bar(x=[f"{model_1}"], y=[wer_result_1]),
-#             go.Bar(x=[f"{model_2}"], y=[wer_result_2]),
-#         ]
-#     )
-
-#     # Update the layout for better visualization
-#     fig.update_layout(
-#         title="Comparison of Two Models",
-#         xaxis_title="Models",
-#         yaxis_title="Value",
-#         barmode="group",
-#     )
-
-#     return results_md, fig
+# GENERAL OPTIONS FOR MODELS AND DATASETS
+MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr"]
+DATASET_OPTIONS = ["Common Voice", "OWN Recoding/Sample"]
 
+# HELPER FUNCTIONS
 def get_card(selected_model:str)->str:
 
     with open("cards.txt", "r") as f:

@@ -63,6 +47,14 @@ def is_own(selected_option):
     else:
         return gr.update(visible=False), gr.update(visible=False)
 
+def make_visible():
+    return gr.update(visible=True), gr.update(visible=True)
+
+
+
+
+
+# THE ACTUAL APP
 with gr.Blocks() as demo:

@@ -126,7 +118,14 @@ Happy experimenting and comparing! 🚀""")
 
     gr.Markdown('## <p style="text-align: center;">Results</p>')
     results_md = gr.Markdown("")
-    results_plot = gr.Plot(show_label=False)
-
+    results_plot = gr.Plot(show_label=False, visible=False)
+    results_df = gr.DataFrame(
+        visible=False,
+        row_count=(5, "dynamic"),  # allow a dynamic number of rows
+        interactive=False,         # read-only: users cannot edit the table
+        wrap=True,                 # wrap long text onto multiple lines
+    )
+    eval_btn.click(make_visible, outputs=[results_plot, results_df])
+    eval_btn.click(run, [data_subset, model_1, model_2, own_audio, own_transcription], [results_md, results_plot, results_df], show_progress=False)
 
 demo.launch(debug=True)
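A note on the new wiring: eval_btn.click now binds run, which yields (markdown, figure, dataframe) tuples instead of returning once. Gradio treats generator handlers as streams and re-renders the bound outputs on every yield, which is what produces the live per-sample progress text. A minimal sketch of the same pattern (slow_eval and the component names are illustrative, not from this commit):

import time
import gradio as gr

def slow_eval(n_steps):
    # Each yield re-renders the bound output component, so the UI
    # shows intermediate results while the loop is still running.
    for i in range(1, int(n_steps) + 1):
        time.sleep(0.5)  # stand-in for per-sample transcription work
        yield f"{i}/{int(n_steps)} samples processed"

with gr.Blocks() as sketch:
    steps = gr.Number(value=5, label="Samples")
    status = gr.Markdown("")
    btn = gr.Button("Evaluate")
    btn.click(slow_eval, inputs=steps, outputs=status, show_progress=False)

sketch.launch()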
processing.py CHANGED

@@ -3,10 +3,12 @@ from transformers import Speech2TextForConditionalGeneration, Speech2TextProcess
 import plotly.graph_objs as go
 from datasets import load_dataset
 from datasets import Audio
-from transformers import pipeline
 import evaluate
 import librosa
 import numpy as np
+import pandas as pd
+
+N_SAMPLES = 30
 
 wer_metric = evaluate.load("wer")
 
@@ -33,11 +35,12 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
     dataset, text_column = load_Common_Voice()
     print("Dataset Loaded")
 
-    # check if models are the same
     model1, processor1 = load_model(model_1)
     model2, processor2 = load_model(model_2)
     print("Models Loaded")
 
+
+
     if data_subset == "OWN Recoding/Sample":
         sample = {"audio":{"array":audio,"sampling_rate":16000}}
         transcription1 = model_compute(model1, processor1, sample, model_1)
@@ -46,11 +49,12 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
         transcriptions1 = [transcription1]
         transcriptions2 = [transcription2]
         references = [own_transcription]
-
+
         wer1 = compute_wer(references, transcriptions1)
         wer2 = compute_wer(references, transcriptions2)
 
-        results_md = f"""
+        results_md = f"""
+#### {model_1}
 - WER Score: {wer1}
 
 #### {model_2}
@@ -59,8 +63,8 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
         # Create the bar plot
         fig = go.Figure(
             data=[
-                go.Bar(x=[f"{model_1}"], y=[wer1]),
-                go.Bar(x=[f"{model_2}"], y=[wer2]),
+                go.Bar(x=[f"{model_1}"], y=[wer1], showlegend=False),
+                go.Bar(x=[f"{model_2}"], y=[wer2], showlegend=False),
             ]
         )
         # Update the layout for better visualization
@@ -71,14 +75,20 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
             barmode="group",
        )
 
-
+        df = pd.DataFrame({"references":references, "transcriptions 1":transcriptions1,"WER 1":[wer1],"transcriptions 2":transcriptions2,"WER 2":[wer2]})
+
+        yield results_md, fig, df
 
     else:
         references = []
         transcriptions1 = []
         transcriptions2 = []
+        WER1s = []
+        WER2s = []
+
+
         counter = 0
-        for sample in dataset:
+        for i, sample in enumerate(dataset, start=1):
             print(counter)
             counter += 1
 
@@ -89,25 +99,30 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
 
                 transcriptions1.append(transcription)
                 transcriptions2.append(transcription)
-            else:
-
-
+            else:
+                transcription1 = model_compute(model1, processor1, sample, model_1)
+                transcription2 = model_compute(model2, processor2, sample, model_2)
+                transcriptions1.append(transcription1)
+                transcriptions2.append(transcription2)
 
-        wer1 = compute_wer(references, transcriptions1)
-        wer2 = compute_wer(references, transcriptions2)
+            WER1s.append(compute_wer([sample[text_column]], [transcription1]))
+            WER2s.append(compute_wer([sample[text_column]], [transcription2]))
 
-        results_md = f"""
-
+            results_md = f"""
+{i}/{len(dataset)}-{'#'*i}{'_'*(N_SAMPLES-i)}
+
+#### {model_1}
+- WER Score: {sum(WER1s)/N_SAMPLES}
 
 #### {model_2}
-- WER Score: {wer2}"""
+- WER Score: {sum(WER2s)/N_SAMPLES}"""
 
             # Create the bar plot
             fig = go.Figure(
                 data=[
-                    go.Bar(x=[f"{model_1}"], y=[wer1]),
-                    go.Bar(x=[f"{model_2}"], y=[wer2]),
+                    go.Bar(x=[f"{model_1}"], y=[sum(WER1s)/N_SAMPLES], showlegend=False),
+                    go.Bar(x=[f"{model_2}"], y=[sum(WER2s)/N_SAMPLES], showlegend=False),
                 ]
             )
 
@@ -119,32 +134,62 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
                 barmode="group",
             )
 
-
-
-
+            df = pd.DataFrame({"references":references, "transcriptions 1":transcriptions1,"WER 1":WER1s,"transcriptions 2":transcriptions2,"WER 2":WER2s})
+
+            yield results_md, fig, df
+
 
 
 # DATASET LOADERS
 def load_Common_Voice():
     dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", revision="streaming", split="test", streaming=True, token=True, trust_remote_code=True)
     text_column = "sentence"
-    dataset = dataset.take(
+    dataset = dataset.take(N_SAMPLES)
     dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
     dataset = list(dataset)
     return dataset, text_column
 
 def load_Vox_Populi():
-
+    # Load the dataset in streaming mode
+    dataset = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True)
+
+    # Preview the first item to understand the structure (can be removed in production)
     print(next(iter(dataset)))
-
-
+
+    # Take N_SAMPLES+20 examples to leave headroom for the filtering step
+    dataset = dataset.take(N_SAMPLES+20)
+    text_column = "normalized_text"
+
+    # Filter out samples with empty or unwanted 'normalized_text' values and invalid audio
+    dataset = dataset.filter(lambda x: is_valid_sample(x[text_column], x['audio']))
+
+    # Keep the first N_SAMPLES examples after filtering
+    dataset = dataset.take(N_SAMPLES)
+
+    # Cast the 'audio' column to the desired sampling rate
     dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
+
+    # Convert to list and return
     dataset = list(dataset)
     return dataset, text_column
 
-
+def is_valid_sample(text, audio):
+    # Check that 'normalized_text' is usable
+    text = text.strip()
+    if text == "" or text == "ignore time segment in scoring":
+        return False
+
+    # Check that the 'audio' array is not empty
+    if len(audio['array']) == 0:
+        return False
+
+    # Keep only audio between 1 and 60 seconds
+    duration = audio['array'].size / audio['sampling_rate']
+    if duration < 1.0 or duration > 60.0:
+        return False
+
+    return True
 
 
 # MODEL LOADERS
@@ -154,7 +199,7 @@ def load_model(model_id:str):
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
     elif model_id == "facebook/s2t-medium-librispeech-asr":
         model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr")
-        processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr"
+        processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr")
     else:
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
@@ -178,7 +223,7 @@ def model_compute(model, processor, sample, model_id):
         attention_mask = features.attention_mask
         gen_tokens = model.generate(input_features=input_features, attention_mask=attention_mask)
         transcription = processor.batch_decode(gen_tokens, skip_special_tokens=True)[0]
-        return transcription
+        return transcription
 
     else:
         return model(sample)
@@ -186,7 +231,7 @@ def model_compute(model, processor, sample, model_id):
 # UTILS
 def compute_wer(references, predictions):
     wer = wer_metric.compute(references=references, predictions=predictions)
-    wer = round(100 * wer, 2)
+    wer = round(N_SAMPLES * wer, 2)
     return wer
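The reworked else branch in run scores each sample individually and reports the running mean, rather than computing one corpus-level WER after the loop. A condensed sketch of that accumulation, using the same evaluate metric as the commit but dummy transcripts in place of model output:

import evaluate

wer_metric = evaluate.load("wer")

references = ["the cat sat", "hello world", "open the door"]
predictions = ["the cat sat", "hello word", "open a door"]  # dummy model output

wers = []
for ref, pred in zip(references, predictions):
    # One score per sample, as in the updated run(); note that averaging
    # per-sample WERs weights every utterance equally, whereas a single
    # corpus-level WER weights utterances by their word count.
    wers.append(wer_metric.compute(references=[ref], predictions=[pred]))
    print(f"{len(wers)}/{len(references)} - running mean WER: {sum(wers) / len(wers):.3f}")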
utils.py DELETED

@@ -1,35 +0,0 @@
-from huggingface_hub import login
-import json
-import evaluate
-import os
-
-def hf_login():
-    hf_token = os.getenv("HF_Token")
-    print(hf_token)
-    if hf_token is None:
-        with open("credentials.json", "r") as f:
-            hf_token = json.load(f)["token"]
-    login(token=hf_token, add_to_git_credential=True)
-
-def data(dataset):
-    for i, item in enumerate(dataset):
-        yield {**item["audio"], "reference": item["norm_text"]}
-
-def compute_wer(references, predictions):
-    wer_metric = evaluate.load("wer")
-    wer = wer_metric.compute(references=references, predictions=predictions)
-    wer = round(100 * wer, 2)
-    return wer
-
-
-# def run_tests (dataset_choice:str, model:str):
-
-#     MoDeL = Model()
-#     MoDeL.select(model)
-#     MoDeL.load()
-#     DaTaSeT = Dataset(100)
-#     DaTaSeT.load(dataset_choice)
-#     references, predictions = MoDeL.process(DaTaSeT)
-#     wer = compute_wer(references=references, predictions=predictions)
-#     return wer
-
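The hf_login helper removed here lives on in app.py with the same token-resolution order: environment variable first, local credentials.json second. A standalone sketch of that fallback (resolve_token is an illustrative name; a credentials.json of the form {"token": "hf_..."} is assumed):

import json
import os

from huggingface_hub import login

def resolve_token(env_var="HF_Token", path="credentials.json"):
    # Prefer the environment (e.g. a Space secret); fall back to a local
    # file so the app also runs on a development machine.
    token = os.getenv(env_var)
    if token is None:
        with open(path, "r") as f:
            token = json.load(f)["token"]
    return token

login(token=resolve_token(), add_to_git_credential=True)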