marinone94 committed
Commit ba980b2
1 Parent(s): bf11fb8

new training

added_tokens.json CHANGED
@@ -1 +1 @@
-{"<s>": 35, "</s>": 36}
+{"<s>": 33, "</s>": 34}
alphabet.json DELETED
@@ -1 +0,0 @@
-{"labels": [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u00e4", "\u00e5", "\u00e9", "\u00f4", "\u00f6", "\u00fc", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
config.json CHANGED
@@ -6,7 +6,7 @@
   "add_adapter": false,
   "apply_spec_augment": true,
   "architectures": [
-    "Wav2Vec2ForCTC"
+    "Wav2Vec2ForPreTraining"
   ],
   "attention_dropout": 0.0,
   "bos_token_id": 1,
@@ -84,7 +84,7 @@
   "num_hidden_layers": 24,
   "num_negatives": 100,
   "output_hidden_size": 1024,
-  "pad_token_id": 34,
+  "pad_token_id": 32,
   "proj_codevector_dim": 768,
   "tdnn_dilation": [
     1,
@@ -107,9 +107,8 @@
     1,
     1
   ],
-  "torch_dtype": "float32",
-  "transformers_version": "4.16.0.dev0",
+  "transformers_version": "4.17.0.dev0",
   "use_weighted_layer_sum": false,
-  "vocab_size": 37,
+  "vocab_size": 35,
   "xvector_output_dim": 512
 }
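
The config changes above line the model head up with the tokenizer files in this commit: pad_token_id 32 points at [PAD] in the new vocab.json, and vocab_size 35 covers its 33 entries plus <s>/</s> from added_tokens.json. A minimal sanity-check sketch, assuming the transformers library and that this repo's files sit in the working directory:

# Sketch: cross-check the updated config.json against the tokenizer files of this commit.
from transformers import Wav2Vec2Config, Wav2Vec2CTCTokenizer

config = Wav2Vec2Config.from_pretrained(".")
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(".")

# [PAD] sits at id 32 in vocab.json; <s>/</s> (ids 33/34) come from added_tokens.json.
assert config.pad_token_id == tokenizer.pad_token_id == 32
assert config.vocab_size == len(tokenizer) == 35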
run.sh CHANGED
@@ -4,7 +4,6 @@ python run_speech_recognition_ctc.py \
     --dataset_config_name="sv-SE,distant_channel" \
     --train_split_name="train+validation,train" \
     --eval_split_name="test,None" \
-    --preprocessing_only \
     --output_dir="./" \
     --overwrite_output_dir \
     --num_train_epochs="5" \
@@ -17,7 +16,7 @@ python run_speech_recognition_ctc.py \
     --evaluation_strategy="epoch" \
     --save_strategy="epoch" \
     --text_column_name="sentence" \
-    --chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – / \
+    --chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
     --logging_steps="100" \
     --layerdrop="0.0" \
     --activation_dropout="0.1" \
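
The "/" is dropped from --chars_to_ignore above because the updated remove_special_characters in run_speech_recognition_ctc.py (see the diff below) now strips "/" explicitly, together with "\\" and "|". A rough sketch of the resulting normalization, with the character class built from the same list as in run.sh (re.escape is added here for safety; the sample sentence is made up):

import re

chars_to_ignore = [",", "?", ".", "!", "-", ";", ":", "\"", "“", "%", "‘", "”", "�", "—", "’", "…", "–"]
chars_to_ignore_regex = f"[{re.escape(''.join(chars_to_ignore))}]"

def normalize(text: str) -> str:
    # Drop punctuation, then apply the same explicit replacements as the script.
    text = re.sub(chars_to_ignore_regex, "", text)
    return (
        text.replace("\\\\Punkt", "")
        .replace("\\\\Komma", "")
        .replace("è", "e").replace("é", "e").replace("î", "i")
        .replace("ü", "u").replace("ÿ", "y")
        .replace("\\", "").replace("/", "").replace("|", "")
        .lower() + " "
    )

# Prints the lower-cased sentence with the listed punctuation removed.
print(normalize("Hej, värld! Är det sant – eller hur?"))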
run_speech_recognition_ctc.py CHANGED
@@ -321,25 +321,20 @@ def create_vocabulary_from_data(
     pad_token: Optional[str] = None,
 ):
     # Given training and test labels create vocabulary
-    def extract_all_chars(batch):
-        all_text = " ".join(batch["target_text"])
-        vocab = list(set(all_text))
-        return {"vocab": [vocab], "all_text": [all_text]}
+    def extract_all_chars(batch, vocab):
+        all_text = " ".join(batch)
+        return list(set(list(set(all_text)) + vocab))
 
-    vocabs = datasets.map(
-        extract_all_chars,
-        batched=True,
-        batch_size=10000,
-        keep_in_memory=False,
-        remove_columns=[col for col in datasets["train"].column_names if col in datasets["eval"].column_names],
-    )
-
-    # take union of all unique characters in each dataset
-    vocab_set = functools.reduce(
-        lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
-    )
+    batch_size = 10000
+    vocab = []
+    for i in range(0, datasets["train"].num_rows, 10000):
+        batch = datasets["train"].select(range(i, min(datasets["train"].num_rows, i+batch_size)))
+        vocab = extract_all_chars(batch["target_text"], vocab)
+    for i in range(0, datasets["eval"].num_rows, 10000):
+        batch = datasets["eval"].select(range(i, min(datasets["eval"].num_rows, i+batch_size)))
+        vocab = extract_all_chars(batch["target_text"], vocab)
 
-    vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}
+    vocab_dict = {v: k for k, v in enumerate(sorted(vocab))}
 
     # replace white space with delimiter token
     if word_delimiter_token is not None:
@@ -458,7 +453,7 @@ def main():
             )
             min_columns_train = common_cols(min_columns_train, new_dataset.column_names)
         else:
-            logging.warning(f"{dataset_name} {dataset_config_name} as split is {train_split_name}")
+            logging.warning(f"{dataset_name} {dataset_config_name} train not loaded as split is {train_split_name}")
 
     if data_args.audio_column_name not in raw_datasets["train"].column_names:
         raise ValueError(
@@ -512,7 +507,7 @@ def main():
            )
            min_columns_eval = common_cols(min_columns_eval, new_dataset.column_names)
        else:
-            logging.warning(f"{dataset_name} {dataset_config_name} as split is {eval_split_name}")
+            logging.warning(f"{dataset_name} {dataset_config_name} eval not loaded as split is {eval_split_name}")
 
     if data_args.max_eval_samples is not None:
         raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
@@ -536,9 +531,32 @@ def main():
 
     def remove_special_characters(batch):
         if chars_to_ignore_regex is not None:
-            batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).replace("\\\\Punkt", "").replace("\\\\Komma", "").lower() + " "
+            batch["target_text"] = \
+                re.sub(chars_to_ignore_regex, "", batch[text_column_name]) \
+                .replace("\\\\Punkt", "") \
+                .replace("\\\\Komma", "") \
+                .replace("è", "e") \
+                .replace("é", "e") \
+                .replace("î", "i") \
+                .replace("ü", "u") \
+                .replace("ÿ", "y") \
+                .replace("\\", "") \
+                .replace("/", "") \
+                .replace("|", "") \
+                .lower() + " "
         else:
-            batch["target_text"] = batch[text_column_name].replace("\\\\Punkt", "").replace("\\\\Komma", "").lower() + " "
+            batch["target_text"] = batch[text_column_name] \
+                .replace("\\\\Punkt", "") \
+                .replace("\\\\Komma", "") \
+                .replace("è", "e") \
+                .replace("é", "e") \
+                .replace("î", "i") \
+                .replace("ü", "u") \
+                .replace("ÿ", "y") \
+                .replace("\\", "") \
+                .replace("/", "") \
+                .replace("|", "") \
+                .lower() + " "
         return batch
 
     num_workers = data_args.preprocessing_num_workers
@@ -694,9 +712,16 @@ def main():
         return batch
 
     with training_args.main_process_first(desc="dataset map preprocessing"):
-        vectorized_datasets = raw_datasets.map(
+        vectorized_datasets = DatasetDict()
+        vectorized_datasets["train"] = raw_datasets["train"].map(
+            prepare_dataset,
+            remove_columns=raw_datasets["train"].column_names,
+            num_proc=num_workers,
+            desc="preprocess datasets",
+        )
+        vectorized_datasets["eval"] = raw_datasets["eval"].map(
             prepare_dataset,
-            remove_columns=next(iter(raw_datasets.values())).column_names,
+            remove_columns=raw_datasets["eval"].column_names,
             num_proc=num_workers,
             desc="preprocess datasets",
         )
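
The largest change above swaps the batched datasets.map vocabulary extraction for an explicit chunked pass over each split, accumulating characters into a plain list. A self-contained sketch of that approach, assuming a DatasetDict with "train"/"eval" splits and a "target_text" column as in the script (the toy data is illustrative only):

from datasets import Dataset, DatasetDict

def extract_all_chars(texts, vocab):
    # Union of the characters collected so far with those in this chunk of texts.
    all_text = " ".join(texts)
    return list(set(list(set(all_text)) + vocab))

def build_vocab(splits: DatasetDict, batch_size: int = 10000) -> dict:
    vocab = []
    for split in ("train", "eval"):
        ds = splits[split]
        for i in range(0, ds.num_rows, batch_size):
            chunk = ds.select(range(i, min(ds.num_rows, i + batch_size)))
            vocab = extract_all_chars(chunk["target_text"], vocab)
    # Sort the characters and assign consecutive integer ids.
    return {v: k for k, v in enumerate(sorted(vocab))}

demo = DatasetDict({
    "train": Dataset.from_dict({"target_text": ["hej värld ", "god morgon "]}),
    "eval": Dataset.from_dict({"target_text": ["tack så mycket "]}),
})
print(build_vocab(demo))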
special_tokens_map.json CHANGED
@@ -1 +1 @@
-{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
vocab.json ADDED
@@ -0,0 +1 @@
+{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, "ä": 27, "å": 28, "ô": 29, "ö": 30, "|": 0, "[UNK]": 31, "[PAD]": 32}