w11wo committed on
Commit
60b2859
·
1 Parent(s): 7bfffb5

added tokenizers

Browse files
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "</s>": 44,
3
+ "<s>": 43
4
+ }
run.sh ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Fine-tune facebook/wav2vec2-base with CTC on the phoneme transcriptions of
# w11wo/ljspeech_phonemes and push the result to the Hugging Face Hub.
#
# Every character handed to --chars_to_ignore is single-quoted: an unquoted `?`
# is a shell glob and would be replaced by any one-character file name that
# happens to exist in the current directory.
python run_speech_recognition_ctc.py \
    --dataset_name="w11wo/ljspeech_phonemes" \
    --text_column_name="phonemes" \
    --train_split_name="train" \
    --model_name_or_path="facebook/wav2vec2-base" \
    --output_dir="./wav2vec2-ljspeech-gruut" \
    --overwrite_output_dir \
    --num_train_epochs="30" \
    --per_device_train_batch_size="16" \
    --gradient_accumulation_steps="2" \
    --learning_rate="1e-4" \
    --warmup_steps="1000" \
    --weight_decay="0.005" \
    --evaluation_strategy="epoch" \
    --eval_metrics wer cer \
    --save_strategy="epoch" \
    --layerdrop="0.0" \
    --save_total_limit="3" \
    --freeze_feature_encoder \
    --gradient_checkpointing \
    --chars_to_ignore ',' '?' '.' '!' '-' ';' ':' '"' '“' '%' '‘' '”' '�' 'ˈ' 'ˌ' \
    --fp16 \
    --group_by_length \
    --report_to="tensorboard" \
    --push_to_hub \
    --do_train --do_eval \
    --hub_model_id="bookbot/wav2vec2-ljspeech-gruut" \
    --hub_private_repo="True" \
    --use_auth_token="True"
run_speech_recognition_ctc.py ADDED
@@ -0,0 +1,856 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+
16
+ """ Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition"""
17
+
18
+ import functools
19
+ import json
20
+ import logging
21
+ import os
22
+ import re
23
+ import sys
24
+ import warnings
25
+ from dataclasses import dataclass, field
26
+ from typing import Dict, List, Optional, Union
27
+
28
+ import datasets
29
+ import numpy as np
30
+ import torch
31
+ from datasets import DatasetDict, load_dataset
32
+
33
+ import evaluate
34
+ import transformers
35
+ from transformers import (
36
+ AutoConfig,
37
+ AutoFeatureExtractor,
38
+ AutoModelForCTC,
39
+ AutoProcessor,
40
+ AutoTokenizer,
41
+ HfArgumentParser,
42
+ Trainer,
43
+ TrainingArguments,
44
+ Wav2Vec2Processor,
45
+ set_seed,
46
+ )
47
+ from transformers.trainer_utils import get_last_checkpoint, is_main_process
48
+ from transformers.utils import check_min_version, send_example_telemetry
49
+ from transformers.utils.versions import require_version
50
+
51
+
52
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
53
+ check_min_version("4.26.0.dev0")
54
+
55
+ require_version(
56
+ "datasets>=1.18.0",
57
+ "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt",
58
+ )
59
+
60
+
61
+ logger = logging.getLogger(__name__)
62
+
63
+
64
def list_field(default=None, metadata=None):
    """Create a dataclass ``field`` for a list-valued (or ``None``) default.

    Dataclasses reject mutable defaults, so the default is supplied through a
    ``default_factory``.

    Args:
        default: The default value; typically a list of strings or ``None``.
        metadata: Mapping forwarded unchanged to ``dataclasses.field``.

    Returns:
        The ``dataclasses.Field`` object produced by ``field``.
    """

    def _fresh_default():
        # Hand out a copy of list defaults so that two instances never share
        # (and accidentally mutate) the very same list object.
        return list(default) if isinstance(default, list) else default

    return field(default_factory=_fresh_default, metadata=metadata)
66
+
67
+
68
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.

    Only `model_name_or_path` is required. The dropout, masking, LayerDrop and
    CTC-loss settings below are written onto the model config in `main` before
    the model is instantiated.
    """

    # --- model / tokenizer location -------------------------------------
    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models"
        }
    )
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"
        },
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": "Where do you want to store the pretrained models downloaded from huggingface.co"
        },
    )
    freeze_feature_encoder: bool = field(
        default=True,
        metadata={"help": "Whether to freeze the feature encoder layers of the model."},
    )

    # --- dropout ---------------------------------------------------------
    attention_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout ratio for the attention probabilities."},
    )
    activation_dropout: float = field(
        default=0.0,
        metadata={
            "help": "The dropout ratio for activations inside the fully connected layer."
        },
    )
    feat_proj_dropout: float = field(
        default=0.0, metadata={"help": "The dropout ratio for the projected features."}
    )
    hidden_dropout: float = field(
        default=0.0,
        metadata={
            "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
        },
    )
    final_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout probability for the final projection layer."},
    )

    # --- masking ---------------------------------------------------------
    # The help fragments are joined by implicit string concatenation, so each
    # continuation line starts with a space to keep the words apart.
    mask_time_prob: float = field(
        default=0.05,
        metadata={
            "help": (
                "Probability of each feature vector along the time axis to be chosen as the start of the vector"
                " span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
                " vectors will be masked along the time axis."
            )
        },
    )
    mask_time_length: int = field(
        default=10,
        metadata={"help": "Length of vector span to mask along the time axis."},
    )
    mask_feature_prob: float = field(
        default=0.0,
        metadata={
            "help": (
                "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
                " span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length``"
                " feature bins will be masked along the time axis."
            )
        },
    )
    mask_feature_length: int = field(
        default=10,
        metadata={"help": "Length of vector span to mask along the feature axis."},
    )

    # --- regularisation / loss --------------------------------------------
    layerdrop: float = field(
        default=0.0, metadata={"help": "The LayerDrop probability."}
    )
    ctc_loss_reduction: Optional[str] = field(
        default="mean",
        metadata={
            "help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."
        },
    )
155
+
156
+
157
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    # --- dataset selection -------------------------------------------------
    dataset_name: str = field(
        metadata={
            "help": "The name of the dataset to use (via the datasets library)."
        }
    )
    # Optional because the default is None (no named configuration).
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the dataset to use (via the datasets library)."
        },
    )
    train_split_name: str = field(
        default="train+validation",
        metadata={
            "help": (
                "The name of the training data set split to use (via the datasets library). Defaults to "
                "'train+validation'"
            )
        },
    )
    eval_split_name: str = field(
        default="test",
        metadata={
            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
        },
    )
    audio_column_name: str = field(
        default="audio",
        metadata={
            "help": "The name of the dataset column containing the audio data. Defaults to 'audio'"
        },
    )
    text_column_name: str = field(
        default="text",
        metadata={
            "help": "The name of the dataset column containing the text data. Defaults to 'text'"
        },
    )

    # --- preprocessing -----------------------------------------------------
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached preprocessed datasets or not."},
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of validation examples to this "
                "value if set."
            )
        },
    )
    chars_to_ignore: Optional[List[str]] = list_field(
        default=None,
        metadata={"help": "A list of characters to remove from the transcripts."},
    )
    eval_metrics: List[str] = list_field(
        default=["wer"],
        metadata={
            "help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"
        },
    )
    max_duration_in_seconds: float = field(
        default=20.0,
        metadata={
            "help": "Filter out audio files that are longer than `max_duration_in_seconds` seconds"
        },
    )
    min_duration_in_seconds: float = field(
        default=0.0,
        metadata={
            "help": "Filter out audio files that are shorter than `min_duration_in_seconds` seconds"
        },
    )
    preprocessing_only: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to only do data preprocessing and skip training. This is especially useful when data"
                " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
                " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
                " can consequently be loaded in distributed training"
            )
        },
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "If :obj:`True`, will use the token generated when running"
                " :obj:`huggingface-cli login` as HTTP bearer authorization for remote files."
            )
        },
    )

    # --- tokenizer special tokens ------------------------------------------
    unk_token: str = field(
        default="[UNK]",
        metadata={"help": "The unk token for the tokenizer"},
    )
    pad_token: str = field(
        default="[PAD]",
        metadata={"help": "The padding token for the tokenizer"},
    )
    word_delimiter_token: str = field(
        default="|",
        metadata={"help": "The word delimiter token for the tokenizer"},
    )
    phoneme_language: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The target language that should be"
                " passed to the tokenizer for tokenization. Note that"
                " this is only relevant if the model classifies the"
                " input audio to a sequence of phoneme sequences."
            )
        },
    )
299
+
300
+
301
@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate CTC training examples into one padded batch.

    Audio inputs and label ids are padded separately because they have
    different lengths and need different padding methods.

    Args:
        processor (:class:`~transformers.AutoProcessor`)
            The processor whose ``pad`` method is used for both the audio inputs and the labels.
        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`"longest"`):
            Padding strategy forwarded unchanged to ``processor.pad`` for inputs and labels.
        pad_to_multiple_of (:obj:`int`, `optional`):
            Forwarded to ``processor.pad`` when padding the audio inputs.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Forwarded to ``processor.pad`` when padding the labels.
    """

    processor: AutoProcessor
    padding: Union[bool, str] = "longest"
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        # Separate the audio from the targets: each side is padded on its own.
        audio_inputs = []
        target_ids = []
        for example in features:
            audio_inputs.append({"input_values": example["input_values"]})
            target_ids.append({"input_ids": example["labels"]})

        batch = self.processor.pad(
            audio_inputs,
            padding=self.padding,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        padded_targets = self.processor.pad(
            labels=target_ids,
            padding=self.padding,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Positions that are pure padding get -100 so the loss ignores them.
        is_padding = padded_targets.attention_mask.ne(1)
        batch["labels"] = padded_targets["input_ids"].masked_fill(is_padding, -100)

        if "attention_mask" in batch:
            batch["attention_mask"] = batch["attention_mask"].to(torch.long)

        return batch
366
+
367
+
368
def create_vocabulary_from_data(
    datasets: "DatasetDict",
    word_delimiter_token: Optional[str] = None,
    unk_token: Optional[str] = None,
    pad_token: Optional[str] = None,
):
    """Build a token-to-id vocabulary from the ``target_text`` column of every split.

    Tokens are the whitespace-separated items of the transcripts (one phoneme
    per item) plus the space character. Ids are assigned in sorted token order.

    Args:
        datasets: Mapping of split name to dataset; every split must expose a
            ``target_text`` column. Any number of splits (one or more) is accepted.
        word_delimiter_token: If given, replaces the space entry and takes over its id.
        unk_token: If given, appended with the next free id.
        pad_token: If given, appended with the next free id (after ``unk_token``).

    Returns:
        A ``dict`` mapping each token to its integer id.
    """

    def extract_all_chars(batch):
        all_text = " ".join(batch["target_text"])
        # phonemes are separated by whitespace, so every item is one symbol
        vocab = list(set(all_text.split())) + [" "]
        return {"vocab": [vocab], "all_text": [all_text]}

    vocabs = datasets.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        # all splits are expected to share their columns, so any split can name them
        remove_columns=next(iter(datasets.values())).column_names,
    )

    # Union of the symbols of every split. An explicit loop (instead of a
    # pairwise reduce over the split objects) is correct for any split count.
    vocab_set = set()
    for split_vocab in vocabs.values():
        vocab_set.update(split_vocab["vocab"][0])

    vocab_dict = {token: index for index, token in enumerate(sorted(vocab_set))}

    # replace white space with delimiter token, keeping the id of the space
    if word_delimiter_token is not None:
        vocab_dict[word_delimiter_token] = vocab_dict.pop(" ")

    # add unk and pad token
    if unk_token is not None:
        vocab_dict[unk_token] = len(vocab_dict)

    if pad_token is not None:
        vocab_dict[pad_token] = len(vocab_dict)

    return vocab_dict
410
+
411
+
412
def main():
    """Run the whole fine-tuning pipeline: parse arguments, prepare data, train, evaluate.

    Steps: load the dataset split named by ``--train_split_name`` and hold out
    15% of it for evaluation, clean the transcripts, build (or load) the
    tokenizer vocabulary, instantiate the feature extractor, tokenizer and CTC
    model, preprocess the audio, then train and/or evaluate with ``Trainer``
    and finally write a model card or push to the Hub.

    Returns:
        An empty ``dict`` (``results`` is never filled in by this function).
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments)
    )
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1])
        )
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_speech_recognition_ctc", model_args, data_args)

    # Detecting last checkpoint.
    last_checkpoint = None
    if (
        os.path.isdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(
        logging.INFO if is_main_process(training_args.local_rank) else logging.WARN
    )

    # Log on each process the small summary:
    # (the first fragment ends with ", " so the two fragments do not run together)
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # 1. First, let's load the dataset
    # NOTE(review): `data_args.use_auth_token` is not consumed anywhere in this function, and
    # `data_args.eval_split_name` is only echoed into the model card below; the evaluation data
    # is always the 15% hold-out created by `train_test_split`.
    raw_datasets = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config_name,
        split=data_args.train_split_name,
    )

    raw_datasets = raw_datasets.train_test_split(test_size=0.15)

    if training_args.do_train:
        if data_args.audio_column_name not in raw_datasets["train"].column_names:
            raise ValueError(
                f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'."
                " Make sure to set `--audio_column_name` to the correct audio column - one of"
                f" {', '.join(raw_datasets['train'].column_names)}."
            )

        if data_args.text_column_name not in raw_datasets["train"].column_names:
            raise ValueError(
                f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
                "Make sure to set `--text_column_name` to the correct text column - one of "
                f"{', '.join(raw_datasets['train'].column_names)}."
            )

        if data_args.max_train_samples is not None:
            raw_datasets["train"] = raw_datasets["train"].select(
                range(data_args.max_train_samples)
            )

    if training_args.do_eval:
        if data_args.max_eval_samples is not None:
            raw_datasets["test"] = raw_datasets["test"].select(
                range(data_args.max_eval_samples)
            )

    # 2. We remove some special characters from the datasets
    # that make training complicated and do not help in transcribing the speech
    # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
    # that could be easily picked up by the model
    if data_args.chars_to_ignore:
        # Escape every character before placing it in the character class. Unescaped, a
        # sequence such as `!`, `-`, `;` turns into the range `!-;`, which would also strip
        # digits and other punctuation that were never listed.
        escaped_chars = "".join(re.escape(char) for char in data_args.chars_to_ignore)
        chars_to_ignore_regex = f"[{escaped_chars}]"
    else:
        # None or an empty list: nothing to strip (an empty class `[]` is not a valid regex)
        chars_to_ignore_regex = None
    text_column_name = data_args.text_column_name

    def remove_special_characters(batch):
        if chars_to_ignore_regex is not None:
            batch["target_text"] = (
                re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
            )
        else:
            batch["target_text"] = batch[text_column_name].lower() + " "
        return batch

    with training_args.main_process_first(
        desc="dataset map special characters removal"
    ):
        raw_datasets = raw_datasets.map(
            remove_special_characters,
            remove_columns=[text_column_name],
            desc="remove special characters from datasets",
        )

    # save special tokens for tokenizer
    word_delimiter_token = data_args.word_delimiter_token
    unk_token = data_args.unk_token
    pad_token = data_args.pad_token

    # 3. Next, let's load the config as we might need it to create
    # the tokenizer
    # load config
    config = AutoConfig.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # 4. Next, if no tokenizer file is defined,
    # we create the vocabulary of the model by extracting all unique characters from
    # the training and evaluation datasets
    # We need to make sure that only first rank saves vocabulary
    # make sure all processes wait until vocab is created
    tokenizer_name_or_path = model_args.tokenizer_name_or_path
    tokenizer_kwargs = {}
    if tokenizer_name_or_path is None:
        # save vocab in training output dir
        tokenizer_name_or_path = training_args.output_dir

        vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")

        with training_args.main_process_first():
            if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
                try:
                    os.remove(vocab_file)
                except OSError:
                    # in shared file-systems it might be the case that
                    # two processes try to delete the vocab file at the same time
                    pass

        with training_args.main_process_first(desc="dataset map vocabulary creation"):
            if not os.path.isfile(vocab_file):
                os.makedirs(tokenizer_name_or_path, exist_ok=True)
                vocab_dict = create_vocabulary_from_data(
                    raw_datasets,
                    word_delimiter_token=word_delimiter_token,
                    unk_token=unk_token,
                    pad_token=pad_token,
                )

                # save vocab dict to be loaded into tokenizer
                # (explicit encoding so the file does not depend on the platform default)
                with open(vocab_file, "w", encoding="utf-8") as file:
                    json.dump(vocab_dict, file)

        # if tokenizer has just been created
        # it is defined by `tokenizer_class` if present in config else by `model_type`
        tokenizer_kwargs = {
            "config": config if config.tokenizer_class is not None else None,
            "tokenizer_type": config.model_type
            if config.tokenizer_class is None
            else None,
            "unk_token": unk_token,
            "pad_token": pad_token,
            "word_delimiter_token": word_delimiter_token,
        }

    # 5. Now we can instantiate the feature extractor, tokenizer and model
    # Note for distributed training, the .from_pretrained methods guarantee that only
    # one local process can concurrently download model & vocab.

    # load feature_extractor and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name_or_path,
        **tokenizer_kwargs,
    )
    feature_extractor = AutoFeatureExtractor.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # adapt config
    config.update(
        {
            "feat_proj_dropout": model_args.feat_proj_dropout,
            "attention_dropout": model_args.attention_dropout,
            "hidden_dropout": model_args.hidden_dropout,
            "final_dropout": model_args.final_dropout,
            "mask_time_prob": model_args.mask_time_prob,
            "mask_time_length": model_args.mask_time_length,
            "mask_feature_prob": model_args.mask_feature_prob,
            "mask_feature_length": model_args.mask_feature_length,
            "gradient_checkpointing": training_args.gradient_checkpointing,
            "layerdrop": model_args.layerdrop,
            "ctc_loss_reduction": model_args.ctc_loss_reduction,
            "pad_token_id": tokenizer.pad_token_id,
            "vocab_size": len(tokenizer),
            "activation_dropout": model_args.activation_dropout,
        }
    )

    # create model
    model = AutoModelForCTC.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        config=config,
    )

    # freeze encoder
    if model_args.freeze_feature_encoder:
        model.freeze_feature_encoder()

    # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
    # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
    # so that we just need to set the correct target sampling rate and normalize the input
    # via the `feature_extractor`

    # make sure that dataset decodes audio with correct sampling rate
    dataset_sampling_rate = (
        next(iter(raw_datasets.values()))
        .features[data_args.audio_column_name]
        .sampling_rate
    )
    if dataset_sampling_rate != feature_extractor.sampling_rate:
        raw_datasets = raw_datasets.cast_column(
            data_args.audio_column_name,
            datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate),
        )

    # derive max & min input length for sample rate & max duration
    max_input_length = (
        data_args.max_duration_in_seconds * feature_extractor.sampling_rate
    )
    min_input_length = (
        data_args.min_duration_in_seconds * feature_extractor.sampling_rate
    )
    audio_column_name = data_args.audio_column_name
    num_workers = data_args.preprocessing_num_workers

    # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
    phoneme_language = data_args.phoneme_language

    # Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    def prepare_dataset(batch):
        # load audio
        sample = batch[audio_column_name]

        inputs = feature_extractor(
            sample["array"], sampling_rate=sample["sampling_rate"]
        )
        batch["input_values"] = inputs.input_values[0]
        batch["input_length"] = len(batch["input_values"])

        # encode targets
        additional_kwargs = {}
        if phoneme_language is not None:
            additional_kwargs["phonemizer_lang"] = phoneme_language

        batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
        return batch

    with training_args.main_process_first(desc="dataset map preprocessing"):
        vectorized_datasets = raw_datasets.map(
            prepare_dataset,
            remove_columns=next(iter(raw_datasets.values())).column_names,
            num_proc=num_workers,
            desc="preprocess datasets",
        )

        def is_audio_in_length_range(length):
            return length > min_input_length and length < max_input_length

        # keep only examples whose length lies strictly between the min and max input length
        vectorized_datasets = vectorized_datasets.filter(
            is_audio_in_length_range,
            num_proc=num_workers,
            input_columns=["input_length"],
        )

    # 7. Next, we can prepare the training.
    # Let's use word error rate (WER) as our evaluation metric,
    # instantiate a data collator and the trainer

    # Define evaluation metrics during training, *i.e.* word error rate, character error rate
    eval_metrics = {metric: evaluate.load(metric) for metric in data_args.eval_metrics}

    # for large datasets it is advised to run the preprocessing on a
    # single machine first with ``args.preprocessing_only`` since there will mostly likely
    # be a timeout when running the script in distributed mode.
    # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
    # cached dataset
    if data_args.preprocessing_only:
        logger.info(
            f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}"
        )
        return

    def compute_metrics(pred):
        pred_logits = pred.predictions
        pred_ids = np.argmax(pred_logits, axis=-1)

        # -100 marks ignored label positions; map them back to the pad id before decoding
        pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id

        pred_str = tokenizer.batch_decode(pred_ids)
        # we do not want to group tokens when computing the metrics
        label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)

        metrics = {
            k: v.compute(predictions=pred_str, references=label_str)
            for k, v in eval_metrics.items()
        }

        return metrics

    # Now save everything to be able to create a single processor later
    if is_main_process(training_args.local_rank):
        # save feature extractor, tokenizer and config
        feature_extractor.save_pretrained(training_args.output_dir)
        tokenizer.save_pretrained(training_args.output_dir)
        config.save_pretrained(training_args.output_dir)

    try:
        processor = AutoProcessor.from_pretrained(training_args.output_dir)
    except (OSError, KeyError):
        warnings.warn(
            "Loading a processor from a feature extractor config that does not"
            " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
            " attribute to your `preprocessor_config.json` file to suppress this warning: "
            " `'processor_class': 'Wav2Vec2Processor'`",
            FutureWarning,
        )
        processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)

    # Instantiate custom data collator
    data_collator = DataCollatorCTCWithPadding(processor=processor)

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
        eval_dataset=vectorized_datasets["test"] if training_args.do_eval else None,
        tokenizer=feature_extractor,
    )

    # 8. Finally, we can start training

    # Training
    if training_args.do_train:

        # use last checkpoint if exist
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None

        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()

        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples
            if data_args.max_train_samples is not None
            else len(vectorized_datasets["train"])
        )
        metrics["train_samples"] = min(
            max_train_samples, len(vectorized_datasets["train"])
        )

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        max_eval_samples = (
            data_args.max_eval_samples
            if data_args.max_eval_samples is not None
            else len(vectorized_datasets["test"])
        )
        metrics["eval_samples"] = min(
            max_eval_samples, len(vectorized_datasets["test"])
        )

        trainer.log_metrics("test", metrics)
        trainer.save_metrics("test", metrics)

    # Write model card and (optionally) push to hub
    config_name = (
        data_args.dataset_config_name
        if data_args.dataset_config_name is not None
        else "na"
    )
    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "tasks": "automatic-speech-recognition",
        "tags": ["automatic-speech-recognition", data_args.dataset_name],
        "dataset_args": (
            f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:"
            f" {data_args.eval_split_name}"
        ),
        "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
    }
    if "common_voice" in data_args.dataset_name:
        kwargs["language"] = config_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)

    return results
853
+
854
+
855
+ if __name__ == "__main__":
856
+ main()
special_tokens_map.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</s>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": "<s>",
19
+ "eos_token": "</s>",
20
+ "pad_token": "[PAD]",
21
+ "unk_token": "[UNK]"
22
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "do_lower_case": false,
4
+ "eos_token": "</s>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "name_or_path": "./wav2vec2-ljspeech-gruut",
7
+ "pad_token": "[PAD]",
8
+ "replace_word_delimiter_char": " ",
9
+ "special_tokens_map_file": null,
10
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
11
+ "unk_token": "[UNK]",
12
+ "word_delimiter_token": "|"
13
+ }
vocab.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[PAD]": 42,
3
+ "[UNK]": 41,
4
+ "aɪ": 1,
5
+ "aʊ": 2,
6
+ "b": 3,
7
+ "d": 4,
8
+ "d͡ʒ": 5,
9
+ "eɪ": 6,
10
+ "f": 7,
11
+ "h": 8,
12
+ "i": 9,
13
+ "j": 10,
14
+ "k": 11,
15
+ "l": 12,
16
+ "m": 13,
17
+ "n": 14,
18
+ "oʊ": 15,
19
+ "p": 16,
20
+ "s": 17,
21
+ "t": 18,
22
+ "t͡ʃ": 19,
23
+ "u": 20,
24
+ "v": 21,
25
+ "w": 22,
26
+ "z": 23,
27
+ "|": 0,
28
+ "æ": 24,
29
+ "ð": 25,
30
+ "ŋ": 26,
31
+ "ɑ": 27,
32
+ "ɔ": 28,
33
+ "ɔɪ": 29,
34
+ "ə": 30,
35
+ "ɚ": 31,
36
+ "ɛ": 32,
37
+ "ɡ": 33,
38
+ "ɪ": 34,
39
+ "ɹ": 35,
40
+ "ʃ": 36,
41
+ "ʊ": 37,
42
+ "ʌ": 38,
43
+ "ʒ": 39,
44
+ "θ": 40
45
+ }