Moreno La Quatra committed
Commit bdf75cc
1 parent: d2a221b

Create eval.py

Files changed (1)
  1. eval.py +244 -0
eval.py ADDED
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+import argparse
+import re
+from typing import Dict
+
+import torch
+from src.data.normalization import normalize_string
+from datasets import Audio, Dataset, load_dataset, load_metric
+
+from transformers import (
+    AutoFeatureExtractor,
+    pipeline,
+    AutoTokenizer,
+    Wav2Vec2Processor,
+    Wav2Vec2ProcessorWithLM,
+    Wav2Vec2ForCTC,
+    AutoConfig,
+)
+
+
+def log_results(result: Dataset, args: Dict[str, str]):
+    """DO NOT CHANGE. This function computes and logs the result metrics."""
+
+    log_outputs = args.log_outputs
+    dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])
+
+    # load metrics
+    wer = load_metric("wer")
+    cer = load_metric("cer")
+
+    # compute metrics
+    wer_result = wer.compute(
+        references=result["target"], predictions=result["prediction"]
+    )
+    cer_result = cer.compute(
+        references=result["target"], predictions=result["prediction"]
+    )
+
+    # print & log results
+    result_str = f"WER: {wer_result}\n" f"CER: {cer_result}"
+    print(result_str)
+
+    with open(f"{dataset_id}_eval_results.txt", "w") as f:
+        f.write(result_str)
+
+    # log all results in a text file. Possibly interesting for analysis
+    if log_outputs is not None:
+        pred_file = f"log_{dataset_id}_predictions.txt"
+        target_file = f"log_{dataset_id}_targets.txt"
+
+        with open(pred_file, "w") as p, open(target_file, "w") as t:
+
+            # mapping function to write output
+            def write_to_file(batch, i):
+                p.write(f"{i}" + "\n")
+                p.write(batch["prediction"] + "\n")
+                t.write(f"{i}" + "\n")
+                t.write(batch["target"] + "\n")
+
+            result.map(write_to_file, with_indices=True)
+
+
+def normalize_text(text: str, invalid_chars_regex: str, to_lower: bool) -> str:
+    """DO ADAPT FOR YOUR USE CASE. This function normalizes the target text."""
+    text = normalize_string(text)
+    text = text.lower() if to_lower else text.upper()
+
+    text = re.sub(invalid_chars_regex, " ", text)
+    text = re.sub(r"\s+", " ", text).strip()
+
+    return text
+
+
+def main(args):
+    # load dataset
+    dataset = load_dataset(
+        args.dataset, args.config, split=args.split, use_auth_token=True
+    )
+
+    # for testing: only process the first few examples
+    # dataset = dataset.select(range(10))
+
+    # load processor
+    # feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
+    # sampling_rate = feature_extractor.sampling_rate
+
+    if args.ctcdecode:
+        processor = Wav2Vec2ProcessorWithLM.from_pretrained(args.model_id)
+        decoder = processor.decoder
+    else:
+        processor = Wav2Vec2Processor.from_pretrained(args.model_id)
+        decoder = None
+
+    feature_extractor = processor.feature_extractor
+    tokenizer = processor.tokenizer
+    sampling_rate = feature_extractor.sampling_rate
+
+    config = AutoConfig.from_pretrained(args.model_id)
+    model = Wav2Vec2ForCTC.from_pretrained(args.model_id)
+
+    # resample audio
+    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
+
+    # load eval pipeline
+    if args.device is None:
+        args.device = 0 if torch.cuda.is_available() else -1
+
+    asr = pipeline(
+        "automatic-speech-recognition",
+        model=model,
+        config=config,
+        feature_extractor=feature_extractor,
+        decoder=decoder,
+        tokenizer=tokenizer,
+        device=args.device,
+    )
+
+    # build normalizer config
+    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
+    tokens = [
+        x for x in tokenizer.convert_ids_to_tokens(range(0, tokenizer.vocab_size))
+    ]
+    special_tokens = [
+        tokenizer.pad_token,
+        tokenizer.word_delimiter_token,
+        tokenizer.unk_token,
+        tokenizer.bos_token,
+        tokenizer.eos_token,
+    ]
+    non_special_tokens = [x for x in tokens if x not in special_tokens]
+    invalid_chars_regex = rf"[^\s{re.escape(''.join(set(non_special_tokens)))}]"
+    # lowercase targets only if the model vocabulary itself is lowercase
+    normalize_to_lower = False
+    for token in non_special_tokens:
+        if token.isalpha() and token.islower():
+            normalize_to_lower = True
+            break
+
+    # map function to decode audio
+    def map_to_pred(
+        batch,
+        args=args,
+        asr=asr,
+        invalid_chars_regex=invalid_chars_regex,
+        normalize_to_lower=normalize_to_lower,
+    ):
+        prediction = asr(
+            batch["audio"]["array"],
+            chunk_length_s=args.chunk_length_s,
+            stride_length_s=args.stride_length_s,
+            # decoder_kwargs={"beam_width": args.beam_width},
+        )
+
+        batch["prediction"] = prediction["text"]
+        batch["target"] = normalize_text(
+            batch["sentence"], invalid_chars_regex, normalize_to_lower
+        )
+        return batch
+
+    # alternative mapping function (currently unused): run the model directly
+    def map_and_decode(batch):
+        inputs = processor(
+            batch["audio"]["array"],
+            sampling_rate=batch["audio"]["sampling_rate"],
+            return_tensors="pt",
+        )
+        with torch.no_grad():
+            logits = model(**inputs).logits
+        transcription = processor.batch_decode(logits.numpy()).text
+        batch["prediction"] = transcription
+        batch["target"] = normalize_text(
+            batch["sentence"], invalid_chars_regex, normalize_to_lower
+        )
+        return batch
+
+    # run inference on all examples
+    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
+
+    # compute and log results
+    # do not change function below
+    log_results(result, args)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_id",
+        type=str,
+        required=True,
+        help="Model identifier. Should be loadable with 🤗 Transformers",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        required=True,
+        help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets",
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        required=True,
+        help="Config of the dataset. *E.g.* `'en'` for Common Voice",
+    )
+    parser.add_argument(
+        "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
+    )
+    parser.add_argument(
+        "--chunk_length_s",
+        type=float,
+        default=None,
+        help="Chunk length in seconds. Defaults to None (no chunking).",
+    )
+    parser.add_argument(
+        "--stride_length_s",
+        type=float,
+        default=None,
+        help="Stride of the audio chunks in seconds. Defaults to None.",
+    )
+    parser.add_argument(
+        "--log_outputs",
+        action="store_true",
+        help="If defined, write outputs to log files for analysis.",
+    )
+    parser.add_argument(
+        "--ctcdecode",
+        action="store_true",
+        help="Decode with the LM-boosted CTC decoder (only if the model repository provides one).",
+    )
+    parser.add_argument(
+        "--device",
+        type=int,
+        default=None,
+        help="The device to run the pipeline on. -1 for CPU, 0 for the first GPU and so on. Defaults to GPU 0 if available, otherwise CPU.",
+    )
+    parser.add_argument(
+        "--beam_width",
+        type=int,
+        default=1,
+        help="Beam width used by the pyctcdecode decoder (unused unless decoder_kwargs is enabled above).",
+    )
+    args = parser.parse_args()
+
+    main(args)
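
For reference, a hypothetical invocation of this script might look like the following; the model identifier, dataset, config, and chunking values are illustrative placeholders and are not part of this commit:

    python eval.py \
        --model_id <your-model-id> \
        --dataset mozilla-foundation/common_voice_8_0 \
        --config en \
        --split test \
        --chunk_length_s 5.0 \
        --stride_length_s 1.0 \
        --log_outputs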