one lid for all
Browse files- README.md +77 -0
- added_tokens.json +1 -0
- config.json +163 -0
- preprocessor_config.json +21 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
- vocab.json +0 -0
README.md
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language: en
|
3 |
+
datasets:
|
4 |
+
- librispeech_asr
|
5 |
+
- common_voice
|
6 |
+
tags:
|
7 |
+
- speech
|
8 |
+
license: apache-2.0
|
9 |
+
---
|
10 |
+
|
11 |
+
# M-CTC-T
|
12 |
+
|
13 |
+
Massively multilingual speech recognizer from Meta AI. The model is a 1B-parameter transformer encoder, with a CTC head over 8065 character labels and a language identification head over 60 language ID labels. It is trained on Common Voice (version 6.1, December 2020 release) and VoxPopuli. After training on Common Voice and VoxPopuli, the model is trained on Common Voice only. The labels are unnormalized character-level transcripts (punctuation and capitalization are not removed). The model takes as input Mel filterbank features from a 16 kHz audio signal.
|
14 |
+
|
15 |
+
![model image](https://raw.githubusercontent.com/cwkeam/scientific-images/main/MCTCT/mctct-arch.png)
|
16 |
+
|
17 |
+
The original Flashlight code, model checkpoints, and Colab notebook can be found at https://github.com/flashlight/wav2letter/tree/main/recipes/mling_pl .
|
18 |
+
|
19 |
+
|
20 |
+
## Citation
|
21 |
+
|
22 |
+
[Paper](https://arxiv.org/abs/2111.00161)
|
23 |
+
|
24 |
+
Authors: Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, Ronan Collobert
|
25 |
+
|
26 |
+
```
|
27 |
+
@article{lugosch2021pseudo,
|
28 |
+
title={Pseudo-Labeling for Massively Multilingual Speech Recognition},
|
29 |
+
author={Lugosch, Loren and Likhomanenko, Tatiana and Synnaeve, Gabriel and Collobert, Ronan},
|
30 |
+
journal={ICASSP},
|
31 |
+
year={2022}
|
32 |
+
}
|
33 |
+
```
|
34 |
+
|
35 |
+
Additional thanks to [Chan Woo Kim](https://huggingface.co/cwkeam) and [Patrick von Platen](https://huggingface.co/patrickvonplaten) for porting the model from Flashlight to PyTorch.
|
36 |
+
|
37 |
+
# Training method
|
38 |
+
|
39 |
+
![model image](https://raw.githubusercontent.com/cwkeam/scientific-images/main/MCTCT/mctct-slimipl.png) TO-DO: replace with the training diagram from the paper
|
40 |
+
|
41 |
+
For more information on how the model was trained, please take a look at the [official paper](https://arxiv.org/abs/2111.00161).
|
42 |
+
|
43 |
+
# Usage
|
44 |
+
|
45 |
+
To transcribe audio files, the model can be used as a standalone acoustic model as follows:
|
46 |
+
|
47 |
+
```python
|
48 |
+
import torch
|
49 |
+
import torchaudio
|
50 |
+
from datasets import load_dataset
|
51 |
+
from transformers import MCTCTForCTC, MCTCTProcessor
|
52 |
+
|
53 |
+
model = MCTCTForCTC.from_pretrained("speechbrain/mctct-large")
|
54 |
+
processor = MCTCTProcessor.from_pretrained("speechbrain/mctct-large")
|
55 |
+
|
56 |
+
# load dummy dataset and read soundfiles
|
57 |
+
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
|
58 |
+
|
59 |
+
# tokenize
|
60 |
+
input_features = processor(ds[0]["audio"]["array"], return_tensors="pt").input_features
|
61 |
+
|
62 |
+
# retrieve logits
|
63 |
+
logits = model(input_features).logits
|
64 |
+
|
65 |
+
# take argmax and decode
|
66 |
+
predicted_ids = torch.argmax(logits, dim=-1)
|
67 |
+
transcription = processor.batch_decode(predicted_ids)
|
68 |
+
```
|
69 |
+
|
70 |
+
Results for Common Voice, averaged over all languages:
|
71 |
+
|
72 |
+
*Character error rate (CER)*:
|
73 |
+
|
74 |
+
| Valid | Test |
|
75 |
+
|-------|------|
|
76 |
+
| 21.4 | 23.3 |
|
77 |
+
|
added_tokens.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"<s>": 8065, "</s>": 8066}
|
config.json
ADDED
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"MCTCTForAudioFrameClassification",
|
4 |
+
"MCTCTForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_head_dim": 384,
|
7 |
+
"attention_probs_dropout_prob": 0.3,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"conv_channels": null,
|
10 |
+
"conv_dropout": 0.3,
|
11 |
+
"conv_glu_dim": 1,
|
12 |
+
"conv_kernel": [
|
13 |
+
7
|
14 |
+
],
|
15 |
+
"conv_stride": [
|
16 |
+
3
|
17 |
+
],
|
18 |
+
"ctc_loss_reduction": "sum",
|
19 |
+
"ctc_zero_infinity": false,
|
20 |
+
"eos_token_id": 2,
|
21 |
+
"hidden_act": "relu",
|
22 |
+
"hidden_dropout_prob": 0.3,
|
23 |
+
"hidden_size": 1536,
|
24 |
+
"id2label": {
|
25 |
+
"0": "ab",
|
26 |
+
"1": "ar",
|
27 |
+
"10": "dv",
|
28 |
+
"11": "el",
|
29 |
+
"12": "en",
|
30 |
+
"13": "eo",
|
31 |
+
"14": "es",
|
32 |
+
"15": "et",
|
33 |
+
"16": "eu",
|
34 |
+
"17": "fa",
|
35 |
+
"18": "fi",
|
36 |
+
"19": "fr",
|
37 |
+
"2": "as",
|
38 |
+
"20": "fy-NL",
|
39 |
+
"21": "ga-IE",
|
40 |
+
"22": "hi",
|
41 |
+
"23": "hsb",
|
42 |
+
"24": "hu",
|
43 |
+
"25": "ia",
|
44 |
+
"26": "id",
|
45 |
+
"27": "it",
|
46 |
+
"28": "ja",
|
47 |
+
"29": "ka",
|
48 |
+
"3": "br",
|
49 |
+
"30": "kab",
|
50 |
+
"31": "ky",
|
51 |
+
"32": "lg",
|
52 |
+
"33": "lt",
|
53 |
+
"34": "lv",
|
54 |
+
"35": "mn",
|
55 |
+
"36": "mt",
|
56 |
+
"37": "nl",
|
57 |
+
"38": "or",
|
58 |
+
"39": "pa-IN",
|
59 |
+
"4": "ca",
|
60 |
+
"40": "pl",
|
61 |
+
"41": "pt",
|
62 |
+
"42": "rm-sursilv",
|
63 |
+
"43": "rm-vallader",
|
64 |
+
"44": "ro",
|
65 |
+
"45": "ru",
|
66 |
+
"46": "rw",
|
67 |
+
"47": "sah",
|
68 |
+
"48": "sl",
|
69 |
+
"49": "sv-SE",
|
70 |
+
"5": "cnh",
|
71 |
+
"50": "ta",
|
72 |
+
"51": "th",
|
73 |
+
"52": "tr",
|
74 |
+
"53": "tt",
|
75 |
+
"54": "uk",
|
76 |
+
"55": "vi",
|
77 |
+
"56": "vot",
|
78 |
+
"57": "zh-CN",
|
79 |
+
"58": "zh-HK",
|
80 |
+
"59": "zh-TW",
|
81 |
+
"6": "cs",
|
82 |
+
"7": "cv",
|
83 |
+
"8": "cy",
|
84 |
+
"9": "de"
|
85 |
+
},
|
86 |
+
"initializer_range": 0.02,
|
87 |
+
"input_channels": 1,
|
88 |
+
"input_feat_per_channel": 80,
|
89 |
+
"intermediate_size": 6144,
|
90 |
+
"label2id": {
|
91 |
+
"ab": 0,
|
92 |
+
"ar": 1,
|
93 |
+
"as": 2,
|
94 |
+
"br": 3,
|
95 |
+
"ca": 4,
|
96 |
+
"cnh": 5,
|
97 |
+
"cs": 6,
|
98 |
+
"cv": 7,
|
99 |
+
"cy": 8,
|
100 |
+
"de": 9,
|
101 |
+
"dv": 10,
|
102 |
+
"el": 11,
|
103 |
+
"en": 12,
|
104 |
+
"eo": 13,
|
105 |
+
"es": 14,
|
106 |
+
"et": 15,
|
107 |
+
"eu": 16,
|
108 |
+
"fa": 17,
|
109 |
+
"fi": 18,
|
110 |
+
"fr": 19,
|
111 |
+
"fy-NL": 20,
|
112 |
+
"ga-IE": 21,
|
113 |
+
"hi": 22,
|
114 |
+
"hsb": 23,
|
115 |
+
"hu": 24,
|
116 |
+
"ia": 25,
|
117 |
+
"id": 26,
|
118 |
+
"it": 27,
|
119 |
+
"ja": 28,
|
120 |
+
"ka": 29,
|
121 |
+
"kab": 30,
|
122 |
+
"ky": 31,
|
123 |
+
"lg": 32,
|
124 |
+
"lt": 33,
|
125 |
+
"lv": 34,
|
126 |
+
"mn": 35,
|
127 |
+
"mt": 36,
|
128 |
+
"nl": 37,
|
129 |
+
"or": 38,
|
130 |
+
"pa-IN": 39,
|
131 |
+
"pl": 40,
|
132 |
+
"pt": 41,
|
133 |
+
"rm-sursilv": 42,
|
134 |
+
"rm-vallader": 43,
|
135 |
+
"ro": 44,
|
136 |
+
"ru": 45,
|
137 |
+
"rw": 46,
|
138 |
+
"sah": 47,
|
139 |
+
"sl": 48,
|
140 |
+
"sv-SE": 49,
|
141 |
+
"ta": 50,
|
142 |
+
"th": 51,
|
143 |
+
"tr": 52,
|
144 |
+
"tt": 53,
|
145 |
+
"uk": 54,
|
146 |
+
"vi": 55,
|
147 |
+
"vot": 56,
|
148 |
+
"zh-CN": 57,
|
149 |
+
"zh-HK": 58,
|
150 |
+
"zh-TW": 59
|
151 |
+
},
|
152 |
+
"layer_norm_eps": 1e-05,
|
153 |
+
"layerdrop": 0.3,
|
154 |
+
"max_position_embeddings": 920,
|
155 |
+
"model_type": "mctct",
|
156 |
+
"num_attention_heads": 4,
|
157 |
+
"num_conv_layers": 1,
|
158 |
+
"num_hidden_layers": 36,
|
159 |
+
"pad_token_id": 1,
|
160 |
+
"torch_dtype": "float32",
|
161 |
+
"transformers_version": "4.20.0.dev0",
|
162 |
+
"vocab_size": 8065
|
163 |
+
}
|
preprocessor_config.json
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"K": 257,
|
3 |
+
"do_normalize": true,
|
4 |
+
"feature_extractor_type": "MCTCFeatureExtractor",
|
5 |
+
"feature_size": 80,
|
6 |
+
"frame_signal_scale": 32768.0,
|
7 |
+
"hop_length": 10,
|
8 |
+
"mel_floor": 1.0,
|
9 |
+
"n_fft": 512,
|
10 |
+
"normalize_means": true,
|
11 |
+
"normalize_vars": true,
|
12 |
+
"padding_side": "right",
|
13 |
+
"padding_value": 0.0,
|
14 |
+
"preemphasis_coeff": 0.97,
|
15 |
+
"return_attention_mask": false,
|
16 |
+
"sample_size": 400,
|
17 |
+
"sample_stride": 160,
|
18 |
+
"sampling_rate": 16000,
|
19 |
+
"win_function": "hamming_window",
|
20 |
+
"win_length": 25
|
21 |
+
}
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4ebe460f70052e0f0b8c1039e11b9bb0a9e2c135fb98b8e88562e5a5936d073f
|
3 |
+
size 4186831517
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "replace_word_delimiter_char": " ", "return_attention_mask": false, "do_normalize": true, "special_tokens_map_file": "./mctc-large/special_tokens_map.json", "name_or_path": "./mctc-large", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
|
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|