j-tobias committed
Commit d521dce · Parent: 09b2769

small updates and improved complexity

__pycache__/dataset.cpython-310.pyc DELETED (binary file, 3.34 kB)
__pycache__/model.cpython-310.pyc DELETED (binary file, 3.74 kB)
__pycache__/processing.cpython-310.pyc DELETED (binary file, 4.24 kB)
__pycache__/utils.cpython-310.pyc DELETED (binary file, 1.04 kB)
app.py CHANGED

```diff
@@ -1,50 +1,34 @@
 import gradio as gr
 from processing import run
-
-# from utils import hf_login
-# hf_login()
-
+import json
 from huggingface_hub import login
 import os
 
+
+# LOG INTO HUGGING FACE
 hf_token = os.getenv("HF_Token")
 login(hf_token)
 
-
-MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr"]
-DATASET_OPTIONS = ["Common Voice", "VoxPopuli", "OWN Recoding/Sample"]
-
-
-# def eval(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:str)->str:
-#     print("OWN AUDIO: ", type(own_audio), own_audio)
-#     wer_result_1, wer_result_2, references, transcriptions1, transcriptions2 = run(data_subset, model_1, model_2, own_audio, own_transcription)
-#     results_md = f"""#### {model_1}
-#     - WER Score: {wer_result_1}
-
-#     #### {model_2}
-#     - WER Score: {wer_result_2}"""
-
-#     # Create the bar plot
-#     fig = go.Figure(
-#         data=[
-#             go.Bar(x=[f"{model_1}"], y=[wer_result_1]),
-#             go.Bar(x=[f"{model_2}"], y=[wer_result_2]),
-#         ]
-#     )
-
-#     # Update the layout for better visualization
-#     fig.update_layout(
-#         title="Comparison of Two Models",
-#         xaxis_title="Models",
-#         yaxis_title="Value",
-#         barmode="group",
-#     )
-
-#     return results_md, fig
+def hf_login():
+    hf_token = os.getenv("HF_Token")
+    if hf_token is None:
+        with open("credentials.json", "r") as f:
+            hf_token = json.load(f)["token"]
+    login(token=hf_token, add_to_git_credential=True)
 
+# hf_login()
+
+
+# GENERAL OPTIONS FOR MODELS AND DATASETS
+MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr"]
+DATASET_OPTIONS = ["Common Voice", "OWN Recoding/Sample"]
 
+# HELPER FUNCTIONS
 def get_card(selected_model:str)->str:
 
     with open("cards.txt", "r") as f:
@@ -63,6 +47,14 @@ def is_own(selected_option):
     else:
         return gr.update(visible=False), gr.update(visible=False)
 
+def make_visible():
+    return gr.update(visible=True), gr.update(visible=True)
+
+
+
+# THE ACTUAL APP
 with gr.Blocks() as demo:
 
@@ -126,7 +118,14 @@ Happy experimenting and comparing! 🚀""")
 
     gr.Markdown('## <p style="text-align: center;">Results</p>')
     results_md = gr.Markdown("")
-    results_plot = gr.Plot(show_label=False)
-    eval_btn.click(run, [data_subset, model_1, model_2, own_audio, own_transcription], [results_md, results_plot])
+    results_plot = gr.Plot(show_label=False, visible=False)
+    results_df = gr.DataFrame(
+        visible=False,
+        row_count=(5, "dynamic"),  # Allow dynamic rows
+        interactive=False,         # Read-only; users cannot edit the table
+        wrap=True,                 # Ensure text wraps to multiple lines
+    )
+    eval_btn.click(make_visible, outputs=[results_plot, results_df])
+    eval_btn.click(run, [data_subset, model_1, model_2, own_audio, own_transcription], [results_md, results_plot, results_df], show_progress=False)
 
 demo.launch(debug=True)
```
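The new results wiring registers two handlers on the same button: `make_visible` first un-hides the plot and table, then `run`, a generator, streams intermediate results into them on each `yield`. A minimal, self-contained sketch of that pattern (the component names and the toy loop here are illustrative, not taken from the app):

```python
import time

import gradio as gr


def reveal():
    # First handler: un-hide the output component before results arrive.
    return gr.update(visible=True)


def evaluate():
    # Generator handlers stream partial results: every yield pushes a new
    # value into the bound output component.
    for i in range(1, 4):
        time.sleep(0.5)  # stand-in for per-sample model inference
        yield f"processed {i}/3 samples"


with gr.Blocks() as demo:
    btn = gr.Button("Evaluate")
    out = gr.Markdown(visible=False)
    btn.click(reveal, outputs=out)    # make the output visible
    btn.click(evaluate, outputs=out)  # then stream results into it

if __name__ == "__main__":
    demo.launch()
```

Splitting the work across two `click` handlers keeps the visibility toggle instant while the slow evaluation streams in behind it.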
processing.py CHANGED

```diff
@@ -3,10 +3,12 @@ from transformers import Speech2TextForConditionalGeneration, Speech2TextProcess
 import plotly.graph_objs as go
 from datasets import load_dataset
 from datasets import Audio
-from transformers import pipeline
 import evaluate
 import librosa
 import numpy as np
+import pandas as pd
+
+N_SAMPLES = 30
 
 wer_metric = evaluate.load("wer")
 
@@ -33,11 +35,12 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
         dataset, text_column = load_Common_Voice()
     print("Dataset Loaded")
 
-    # check if models are the same
     model1, processor1 = load_model(model_1)
     model2, processor2 = load_model(model_2)
     print("Models Loaded")
 
+
+
     if data_subset == "OWN Recoding/Sample":
         sample = {"audio":{"array":audio,"sampling_rate":16000}}
         transcription1 = model_compute(model1, processor1, sample, model_1)
@@ -46,11 +49,12 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
         transcriptions1 = [transcription1]
         transcriptions2 = [transcription2]
         references = [own_transcription]
-
+
         wer1 = compute_wer(references, transcriptions1)
         wer2 = compute_wer(references, transcriptions2)
 
-        results_md = f"""#### {model_1}
+        results_md = f"""
+        #### {model_1}
         - WER Score: {wer1}
 
         #### {model_2}
@@ -59,8 +63,8 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
         # Create the bar plot
         fig = go.Figure(
             data=[
-                go.Bar(x=[f"{model_1}"], y=[wer1]),
-                go.Bar(x=[f"{model_2}"], y=[wer2]),
+                go.Bar(x=[f"{model_1}"], y=[wer1], showlegend=False),
+                go.Bar(x=[f"{model_2}"], y=[wer2], showlegend=False),
             ]
         )
         # Update the layout for better visualization
@@ -71,14 +75,20 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
             barmode="group",
         )
 
-        yield results_md, fig
+        df = pd.DataFrame({"references":references, "transcriptions 1":transcriptions1, "WER 1":[wer1], "transcriptions 2":transcriptions2, "WER 2":[wer2]})
+
+        yield results_md, fig, df
 
     else:
        references = []
        transcriptions1 = []
        transcriptions2 = []
+       WER1s = []
+       WER2s = []
+
+
        counter = 0
-       for sample in dataset:
+       for i, sample in enumerate(dataset, start=1):
            print(counter)
            counter += 1
 
@@ -89,25 +99,30 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
 
                transcriptions1.append(transcription)
                transcriptions2.append(transcription)
-           else:
-               transcriptions1.append(model_compute(model1, processor1, sample, model_1))
-               transcriptions2.append(model_compute(model2, processor2, sample, model_2))
+           else:
+               transcription1 = model_compute(model1, processor1, sample, model_1)
+               transcription2 = model_compute(model2, processor2, sample, model_2)
+               transcriptions1.append(transcription1)
+               transcriptions2.append(transcription2)
 
-           wer1 = compute_wer(references, transcriptions1)
-           wer2 = compute_wer(references, transcriptions2)
+           WER1s.append(compute_wer([sample[text_column]], [transcription1]))
+           WER2s.append(compute_wer([sample[text_column]], [transcription2]))
 
-           results_md = f"""#### {model_1}
-           - WER Score: {wer1}
+           results_md = f"""
+           {i}/{len(dataset)}-{'#'*i}{'_'*(N_SAMPLES-i)}
+
+           #### {model_1}
+           - WER Score: {sum(WER1s)/N_SAMPLES}
 
            #### {model_2}
-           - WER Score: {wer2}"""
+           - WER Score: {sum(WER2s)/N_SAMPLES}"""
 
           # Create the bar plot
           fig = go.Figure(
               data=[
-                  go.Bar(x=[f"{model_1}"], y=[wer1]),
-                  go.Bar(x=[f"{model_2}"], y=[wer2]),
+                  go.Bar(x=[f"{model_1}"], y=[sum(WER1s)/N_SAMPLES], showlegend=False),
+                  go.Bar(x=[f"{model_2}"], y=[sum(WER2s)/N_SAMPLES], showlegend=False),
               ]
           )
 
@@ -119,32 +134,62 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
               barmode="group",
           )
 
-          yield results_md, fig
-
-
+          df = pd.DataFrame({"references":references, "transcriptions 1":transcriptions1, "WER 1":WER1s, "transcriptions 2":transcriptions2, "WER 2":WER2s})
+
+          yield results_md, fig, df
 
 
 # DATASET LOADERS
 def load_Common_Voice():
     dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", revision="streaming", split="test", streaming=True, token=True, trust_remote_code=True)
     text_column = "sentence"
-    dataset = dataset.take(100)
+    dataset = dataset.take(N_SAMPLES)
     dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
     dataset = list(dataset)
     return dataset, text_column
 
 def load_Vox_Populi():
-    dataset = dataset = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True)
+    # Load the dataset in streaming mode
+    dataset = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True)
+
+    # Optionally, preview the first item to understand the structure (can be removed in production)
     print(next(iter(dataset)))
-    text_column = "raw_text"
-    dataset = dataset.take(100)
+
+    # Take the first N_SAMPLES+20 examples to work with
+    dataset = dataset.take(N_SAMPLES+20)
+    text_column = "normalized_text"
+
+    # Filter out samples with empty or unwanted 'normalized_text' values and invalid audio
+    dataset = dataset.filter(lambda x: is_valid_sample(x[text_column], x['audio']))
+
+    # Take the first N_SAMPLES examples after filtering
+    dataset = dataset.take(N_SAMPLES)
+
+    # Cast the 'audio' column to the desired sampling rate
     dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
+
+    # Convert to list and return
     dataset = list(dataset)
     return dataset, text_column
 
-
+def is_valid_sample(text, audio):
+    # Check if 'normalized_text' is valid
+    text = text.strip()
+    if text == "" or text == "ignore time segment in scoring":
+        return False
+
+    # Check if the 'audio' array is valid (not empty and meets length criteria)
+    if len(audio['array']) == 0:  # Audio is empty
+        return False
+
+    # Optionally, check if the audio duration is within a certain range
+    duration = audio['array'].size / audio['sampling_rate']
+    if duration < 1.0 or duration > 60.0:  # Filter out audio shorter than 1 s or longer than 60 s
+        return False
+
+    return True
 
 
 # MODEL LOADERS
@@ -154,7 +199,7 @@ def load_model(model_id:str):
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
     elif model_id == "facebook/s2t-medium-librispeech-asr":
         model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr")
-        processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr", do_upper_case=True)
+        processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr")
     else:
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
@@ -178,7 +223,7 @@ def model_compute(model, processor, sample, model_id):
         attention_mask = features.attention_mask
         gen_tokens = model.generate(input_features=input_features, attention_mask=attention_mask)
         transcription = processor.batch_decode(gen_tokens, skip_special_tokens=True)[0]
-        return transcription[0]
+        return transcription
 
     else:
         return model(sample)
@@ -186,7 +231,7 @@ def model_compute(model, processor, sample, model_id):
 # UTILS
 def compute_wer(references, predictions):
     wer = wer_metric.compute(references=references, predictions=predictions)
-    wer = round(100 * wer, 2)
+    wer = round(N_SAMPLES * wer, 2)
     return wer
```
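Note that `compute_wer` now scales the raw error rate by `N_SAMPLES` rather than the conventional factor of 100, so the running averages `sum(WER1s)/N_SAMPLES` shown in `run()` work out to sums of fractional error rates rather than percentages. For reference, a minimal sketch of per-sample and corpus-level WER with the usual percentage scaling (the reference and prediction strings here are illustrative):

```python
import evaluate

# Load the WER metric once; repeated loads are comparatively slow.
wer_metric = evaluate.load("wer")

references = ["the cat sat on the mat", "hello world"]
predictions = ["the cat sat on a mat", "hello word"]

# Corpus-level WER: pools edit operations and word counts over all pairs,
# then divides once. This is the standard way to report WER.
corpus_wer = 100 * wer_metric.compute(references=references, predictions=predictions)

# Per-sample WER, e.g. for a progress table. The mean of per-sample WERs
# generally differs from the corpus-level value, because short references
# get the same weight as long ones.
per_sample = [
    100 * wer_metric.compute(references=[r], predictions=[p])
    for r, p in zip(references, predictions)
]

print(f"corpus WER: {corpus_wer:.2f}%")
print(f"per-sample WERs: {[round(w, 2) for w in per_sample]}")
```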
utils.py DELETED

```diff
@@ -1,35 +0,0 @@
-from huggingface_hub import login
-import json
-import evaluate
-import os
-
-def hf_login():
-    hf_token = os.getenv("HF_Token")
-    print(hf_token)
-    if hf_token is None:
-        with open("credentials.json", "r") as f:
-            hf_token = json.load(f)["token"]
-    login(token=hf_token, add_to_git_credential=True)
-
-def data(dataset):
-    for i, item in enumerate(dataset):
-        yield {**item["audio"], "reference": item["norm_text"]}
-
-def compute_wer(references, predictions):
-    wer_metric = evaluate.load("wer")
-    wer = wer_metric.compute(references=references, predictions=predictions)
-    wer = round(100 * wer, 2)
-    return wer
-
-
-# def run_tests (dataset_choice:str, model:str):
-
-#     MoDeL = Model()
-#     MoDeL.select(model)
-#     MoDeL.load()
-#     DaTaSeT = Dataset(100)
-#     DaTaSeT.load(dataset_choice)
-#     references, predictions = MoDeL.process(DaTaSeT)
-#     wer = compute_wer(references=references, predictions=predictions)
-#     return wer
-
```
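The deleted `hf_login` helper survives in trimmed form in app.py: it prefers the `HF_Token` environment variable (a Space secret) and falls back to a local JSON file for development. As a standalone reference, a minimal sketch of that token-resolution pattern, assuming a local credentials.json of the form `{"token": "hf_..."}`:

```python
import json
import os

from huggingface_hub import login


def hf_login(credentials_path: str = "credentials.json") -> None:
    """Log in to the Hugging Face Hub.

    Prefers the HF_Token environment variable; falls back to a local
    JSON credentials file when the variable is unset.
    """
    hf_token = os.getenv("HF_Token")
    if hf_token is None:
        # Local fallback: expects {"token": "hf_..."} in credentials.json.
        with open(credentials_path, "r") as f:
            hf_token = json.load(f)["token"]
    login(token=hf_token, add_to_git_credential=True)


if __name__ == "__main__":
    hf_login()
```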