Lasion committed on
Commit
10f6e8c
1 Parent(s): fdc02ec

Upload tokenizer

Browse files
Files changed (5) hide show
  1. README.md +1 -1
  2. added_tokens.json +2 -2
  3. special_tokens_map.json +2 -2
  4. tokenizer_config.json +8 -8
  5. vocab.json +109 -91
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  license: apache-2.0
3
- base_model: facebook/wav2vec2-xls-r-300m
4
  tags:
5
  - generated_from_trainer
 
6
  metrics:
7
  - wer
8
  model-index:
 
1
  ---
2
  license: apache-2.0
 
3
  tags:
4
  - generated_from_trainer
5
+ base_model: facebook/wav2vec2-xls-r-300m
6
  metrics:
7
  - wer
8
  model-index:
added_tokens.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
- "</s>": 93,
3
- "<s>": 92
4
  }
 
1
  {
2
+ "</s>": 111,
3
+ "<s>": 110
4
  }
special_tokens_map.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "bos_token": "<s>",
3
  "eos_token": "</s>",
4
- "pad_token": "[PAD]",
5
- "unk_token": "[UNK]"
6
  }
 
1
  {
2
  "bos_token": "<s>",
3
  "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
  }
tokenizer_config.json CHANGED
@@ -1,22 +1,22 @@
1
  {
2
  "added_tokens_decoder": {
3
- "90": {
4
- "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
7
  "rstrip": true,
8
  "single_word": false,
9
  "special": false
10
  },
11
- "91": {
12
- "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
15
  "rstrip": true,
16
  "single_word": false,
17
  "special": false
18
  },
19
- "92": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": false,
@@ -24,7 +24,7 @@
24
  "single_word": false,
25
  "special": true
26
  },
27
- "93": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": false,
@@ -38,10 +38,10 @@
38
  "do_lower_case": false,
39
  "eos_token": "</s>",
40
  "model_max_length": 1000000000000000019884624838656,
41
- "pad_token": "[PAD]",
42
  "replace_word_delimiter_char": " ",
43
  "target_lang": null,
44
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
45
- "unk_token": "[UNK]",
46
  "word_delimiter_token": "|"
47
  }
 
1
  {
2
  "added_tokens_decoder": {
3
+ "108": {
4
+ "content": "<unk>",
5
  "lstrip": true,
6
  "normalized": false,
7
  "rstrip": true,
8
  "single_word": false,
9
  "special": false
10
  },
11
+ "109": {
12
+ "content": "<pad>",
13
  "lstrip": true,
14
  "normalized": false,
15
  "rstrip": true,
16
  "single_word": false,
17
  "special": false
18
  },
19
+ "110": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": false,
 
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "111": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": false,
 
38
  "do_lower_case": false,
39
  "eos_token": "</s>",
40
  "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "<pad>",
42
  "replace_word_delimiter_char": " ",
43
  "target_lang": null,
44
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
45
+ "unk_token": "<unk>",
46
  "word_delimiter_token": "|"
47
  }
vocab.json CHANGED
@@ -1,94 +1,112 @@
1
  {
2
- "[PAD]": 91,
3
- "[UNK]": 90,
4
- "a": 80,
5
- "b": 26,
6
- "c": 78,
7
- "d": 47,
8
- "e": 41,
9
- "g": 37,
10
- "h": 64,
11
- "i": 85,
12
- "k": 5,
13
- "l": 38,
14
- "m": 43,
15
- "n": 81,
16
- "o": 14,
17
- "p": 21,
18
- "q": 16,
19
- "r": 55,
20
- "s": 10,
21
- "t": 40,
22
- "u": 9,
23
- "v": 25,
24
- "x": 69,
25
- "y": 62,
26
- "|": 58,
27
- "à": 23,
28
- "á": 7,
29
- "â": 42,
30
- "ã": 59,
31
- "è": 6,
32
- "é": 63,
33
- "ê": 36,
34
- "ì": 28,
35
- "í": 89,
36
- "ò": 3,
37
- "ó": 34,
38
- "ô": 75,
39
- "õ": 35,
40
- "ù": 27,
41
- "ú": 39,
42
- "ý": 19,
43
- "ă": 86,
44
- "đ": 45,
45
- "ĩ": 18,
46
- "ũ": 54,
47
- "ơ": 68,
48
- "ư": 77,
49
- "ạ": 84,
50
- "ả": 56,
51
- "ấ": 71,
52
- "ầ": 15,
53
- "ẩ": 0,
54
- "ẫ": 72,
55
- "ậ": 87,
56
- "ắ": 82,
57
- "ằ": 61,
58
- "ẳ": 1,
59
- "ẵ": 33,
60
- "ặ": 88,
61
- "ẹ": 70,
62
- "ẻ": 20,
63
- "ẽ": 76,
64
- "ế": 2,
65
- "ề": 44,
66
- "ể": 46,
67
- "ễ": 31,
68
- "ệ": 29,
69
- "ỉ": 22,
70
- "ị": 12,
71
- "ọ": 50,
72
- "ỏ": 4,
73
- "ố": 57,
74
- "ồ": 13,
75
- "ổ": 24,
76
- "ỗ": 48,
77
- "ộ": 32,
78
- "ớ": 51,
79
- "ờ": 83,
80
- "ở": 17,
81
- "ỡ": 79,
82
- "ợ": 52,
83
- "ụ": 67,
84
- "ủ": 53,
85
- "ứ": 30,
86
- "ừ": 65,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  "ử": 60,
88
- "ữ": 66,
89
- "ự": 11,
90
- "ỳ": 8,
91
- "ỵ": 73,
92
- "ỷ": 49,
93
- "ỹ": 74
94
  }
 
1
  {
2
+ "0": 63,
3
+ "1": 94,
4
+ "2": 55,
5
+ "3": 4,
6
+ "4": 34,
7
+ "5": 107,
8
+ "6": 1,
9
+ "7": 11,
10
+ "8": 43,
11
+ "9": 51,
12
+ "<pad>": 109,
13
+ "<unk>": 108,
14
+ "a": 17,
15
+ "b": 36,
16
+ "c": 49,
17
+ "d": 44,
18
+ "e": 65,
19
+ "f": 77,
20
+ "g": 58,
21
+ "h": 71,
22
+ "i": 57,
23
+ "j": 22,
24
+ "k": 96,
25
+ "l": 18,
26
+ "m": 67,
27
+ "n": 26,
28
+ "o": 105,
29
+ "p": 98,
30
+ "q": 20,
31
+ "r": 47,
32
+ "s": 39,
33
+ "t": 56,
34
+ "u": 31,
35
+ "v": 15,
36
+ "w": 35,
37
+ "x": 81,
38
+ "y": 32,
39
+ "z": 87,
40
+ "|": 46,
41
+ "à": 24,
42
+ "á": 62,
43
+ "â": 72,
44
+ "ã": 83,
45
+ "è": 90,
46
+ "é": 27,
47
+ "ê": 12,
48
+ "ì": 40,
49
+ "í": 3,
50
+ "ò": 92,
51
+ "ó": 82,
52
+ "ô": 30,
53
+ "õ": 10,
54
+ "ù": 53,
55
+ "ú": 73,
56
+ "ý": 6,
57
+ "ă": 104,
58
+ "đ": 89,
59
+ "ĩ": 70,
60
+ "ũ": 48,
61
+ "ơ": 95,
62
+ "ư": 106,
63
+ "̀": 61,
64
+ "́": 59,
65
+ "̣": 86,
66
+ "у": 29,
67
+ "ạ": 50,
68
+ "ả": 88,
69
+ "ấ": 99,
70
+ "ầ": 41,
71
+ "ẩ": 7,
72
+ "ẫ": 97,
73
+ "ậ": 64,
74
+ "ắ": 79,
75
+ "ằ": 33,
76
+ "ẳ": 68,
77
+ "ẵ": 93,
78
+ "ặ": 76,
79
+ "ẹ": 103,
80
+ "ẻ": 0,
81
+ "ẽ": 100,
82
+ "ế": 52,
83
+ "ề": 9,
83
+ "ể": 45,
84
+ "ễ": 38,
85
+ "ệ": 37,
87
+ "ỉ": 101,
88
+ "ị": 85,
89
+ "ọ": 74,
90
+ "ỏ": 14,
91
+ "ố": 23,
92
+ "ồ": 75,
93
+ "ổ": 84,
94
+ "ỗ": 25,
95
+ "ộ": 66,
96
+ "ớ": 102,
97
+ "ờ": 21,
98
+ "ở": 8,
99
+ "ỡ": 54,
100
+ "ợ": 69,
101
+ "ụ": 2,
102
+ "ủ": 28,
103
+ "ứ": 13,
104
+ "ừ": 91,
105
  "ử": 60,
106
+ "ữ": 78,
107
+ "ự": 19,
108
+ "ỳ": 80,
109
+ "ỵ": 42,
110
+ "ỷ": 16,
111
+ "ỹ": 5
112
  }