corrected tokenizer chars
Browse files
- config.json +4 -4
- run.sh +1 -0
- run_speech_recognition_ctc.py +1 -1
- tokenizer_config.json +1 -1
- vocab.json +1 -1
config.json
CHANGED
@@ -6,7 +6,7 @@
|
|
6 |
"add_adapter": false,
|
7 |
"apply_spec_augment": true,
|
8 |
"architectures": [
|
9 |
-
"
|
10 |
],
|
11 |
"attention_dropout": 0.0,
|
12 |
"bos_token_id": 1,
|
@@ -61,10 +61,10 @@
|
|
61 |
"layerdrop": 0.0,
|
62 |
"mask_feature_length": 10,
|
63 |
"mask_feature_min_masks": 0,
|
64 |
-
"mask_feature_prob": 0.
|
65 |
"mask_time_length": 10,
|
66 |
"mask_time_min_masks": 2,
|
67 |
-
"mask_time_prob": 0.
|
68 |
"model_type": "wav2vec2",
|
69 |
"num_adapter_layers": 3,
|
70 |
"num_attention_heads": 16,
|
@@ -100,7 +100,7 @@
|
|
100 |
1
|
101 |
],
|
102 |
"torch_dtype": "float32",
|
103 |
-
"transformers_version": "4.
|
104 |
"use_weighted_layer_sum": false,
|
105 |
"vocab_size": 218,
|
106 |
"xvector_output_dim": 512
|
|
|
6 |
"add_adapter": false,
|
7 |
"apply_spec_augment": true,
|
8 |
"architectures": [
|
9 |
+
"Wav2Vec2ForPreTraining"
|
10 |
],
|
11 |
"attention_dropout": 0.0,
|
12 |
"bos_token_id": 1,
|
|
|
61 |
"layerdrop": 0.0,
|
62 |
"mask_feature_length": 10,
|
63 |
"mask_feature_min_masks": 0,
|
64 |
+
"mask_feature_prob": 0.33,
|
65 |
"mask_time_length": 10,
|
66 |
"mask_time_min_masks": 2,
|
67 |
+
"mask_time_prob": 0.05,
|
68 |
"model_type": "wav2vec2",
|
69 |
"num_adapter_layers": 3,
|
70 |
"num_attention_heads": 16,
|
|
|
100 |
1
|
101 |
],
|
102 |
"torch_dtype": "float32",
|
103 |
+
"transformers_version": "4.17.0.dev0",
|
104 |
"use_weighted_layer_sum": false,
|
105 |
"vocab_size": 218,
|
106 |
"xvector_output_dim": 512
|
run.sh
CHANGED
@@ -4,6 +4,7 @@ python run_speech_recognition_ctc.py \
|
|
4 |
--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
|
5 |
--dataset_config_name="fr" \
|
6 |
--output_dir="./" \
|
|
|
7 |
--overwrite_output_dir \
|
8 |
--num_train_epochs="5" \
|
9 |
--per_device_train_batch_size="64" \
|
|
|
4 |
--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
|
5 |
--dataset_config_name="fr" \
|
6 |
--output_dir="./" \
|
7 |
+
--tokenizer_name_or_path="./" \
|
8 |
--overwrite_output_dir \
|
9 |
--num_train_epochs="5" \
|
10 |
--per_device_train_batch_size="64" \
|
run_speech_recognition_ctc.py
CHANGED
@@ -643,7 +643,7 @@ def main():
|
|
643 |
|
644 |
pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
|
645 |
|
646 |
-
pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
|
647 |
# we do not want to group tokens when computing the metrics
|
648 |
label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
|
649 |
|
|
|
643 |
|
644 |
pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
|
645 |
|
646 |
+
pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)#being sure to remove <s> from the output
|
647 |
# we do not want to group tokens when computing the metrics
|
648 |
label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
|
649 |
|
tokenizer_config.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"unk_token": "[UNK]", "bos_token":
|
|
|
1 |
+
{"unk_token": "[UNK]", "bos_token": null, "eos_token": null, "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
|
vocab.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"'": 1, "(": 2, ")": 3, "*": 4, ".": 5, "/": 6, "1": 7, "2": 8, "=": 9, "C": 10, "E": 11, "N": 12, "Q": 13, "R": 14, "Z": 15, "`": 16, "a": 17, "b": 18, "c": 19, "d": 20, "e": 21, "f": 22, "g": 23, "h": 24, "i": 25, "j": 26, "k": 27, "l": 28, "m": 29, "n": 30, "o": 31, "p": 32, "q": 33, "r": 34, "s": 35, "t": 36, "u": 37, "v": 38, "w": 39, "x": 40, "y": 41, "z": 42, "{": 43, "|": 0, "}": 45, "~": 46, "
|
|
|
1 |
+
{"'": 1, "(": 2, ")": 3, "*": 4, ".": 5, "/": 6, "1": 7, "2": 8, "=": 9, "C": 10, "E": 11, "N": 12, "Q": 13, "R": 14, "Z": 15, "`": 16, "a": 17, "b": 18, "c": 19, "d": 20, "e": 21, "f": 22, "g": 23, "h": 24, "i": 25, "j": 26, "k": 27, "l": 28, "m": 29, "n": 30, "o": 31, "p": 32, "q": 33, "r": 34, "s": 35, "t": 36, "u": 37, "v": 38, "w": 39, "x": 40, "y": 41, "z": 42, "{": 43, "|": 0, "}": 45, "~": 46, "\u00a7": 47, "\u00ab": 48, "\u00ae": 49, "\u00b0": 50, "\u00b1": 51, "\u00b7": 52, "\u00bb": 53, "\u00d7": 54, "\u00df": 55, "\u00e6": 56, "\u00e7": 57, "\u00f0": 58, "\u00f8": 59, "\u00fe": 60, "\u0111": 61, "\u0127": 62, "\u0131": 63, "\u0142": 64, "\u0153": 65, "\u01c0": 66, "\u01c3": 67, "\u0251": 68, "\u0259": 69, "\u0268": 70, "\u0289": 71, "\u0294": 72, "\u02bb": 73, "\u02bc": 74, "\u02bd": 75, "\u02be": 76, "\u02bf": 77, "\u02d0": 78, "\u03b1": 79, "\u03b2": 80, "\u03b3": 81, "\u03b4": 82, "\u03b5": 83, "\u03b6": 84, "\u03b7": 85, "\u03b8": 86, "\u03b9": 87, "\u03ba": 88, "\u03bb": 89, "\u03bc": 90, "\u03bd": 91, "\u03bf": 92, "\u03c0": 93, "\u03c1": 94, "\u03c2": 95, "\u03c3": 96, "\u03c4": 97, "\u03c5": 98, "\u03c6": 99, "\u03c7": 100, "\u03c8": 101, "\u03c9": 102, "\u0430": 103, "\u0433": 104, "\u0435": 105, "\u0437": 106, "\u0438": 107, "\u043a": 108, "\u043c": 109, "\u043d": 110, "\u043e": 111, "\u043f": 112, "\u0440": 113, "\u0446": 114, "\u0447": 115, "\u044d": 116, "\u044f": 117, "\u0454": 118, "\u0456": 119, "\u0458": 120, "\u045f": 121, "\u04ab": 122, "\u04cc": 123, "\u0563": 124, "\u0566": 125, "\u0627": 126, "\u0628": 127, "\u0629": 128, "\u062f": 129, "\u0631": 130, "\u0644": 131, "\u0645": 132, "\u0646": 133, "\u0648": 134, "\u064a": 135, "\u1100": 136, "\u1106": 137, "\u1109": 138, "\u110c": 139, "\u1161": 140, "\u1162": 141, "\u1165": 142, "\u1169": 143, "\u1175": 144, "\u11a8": 145, "\u11b7": 146, "\u11b8": 147, "\u11bc": 148, "\u1240": 149, "\u12a8": 150, "\u12c8": 151, "\u12f0": 152, "\u1300": 153, "\u2010": 154, "\u2013": 155, "\u2014": 
156, "\u2015": 157, "\u2019": 158, "\u201e": 159, "\u2020": 160, "\u2032": 161, "\u2039": 162, "\u203a": 163, "\u2044": 164, "\u20bd": 165, "\u2192": 166, "\u2194": 167, "\u2205": 168, "\u2206": 169, "\u2208": 170, "\u2212": 171, "\u221e": 172, "\u2228": 173, "\u223c": 174, "\u2265": 175, "\u22a8": 176, "\u22c5": 177, "\u2500": 178, "\u2609": 179, "\u2c45": 180, "\u2c4e": 181, "\u3044": 182, "\u3046": 183, "\u305f": 184, "\u3064": 185, "\u306e": 186, "\u3072": 187, "\u3078": 188, "\u307e": 189, "\u3080": 190, "\u3081": 191, "\u3082": 192, "\u3084": 193, "\u4e09": 194, "\u4e39": 195, "\u4e43": 196, "\u4eac": 197, "\u4fdd": 198, "\u5317": 199, "\u53b3": 200, "\u5b87": 201, "\u626c": 202, "\u6587": 203, "\u661f": 204, "\u672f": 205, "\u675c": 206, "\u6d25": 207, "\u7261": 208, "\u750c": 209, "\u7f8e": 210, "\u897f": 211, "\u8cb4": 212, "\u9752": 213, "\u9986": 214, "\ua751": 215, "[UNK]": 215, "[PAD]": 216}
|