codebyzeb commited on
Commit
7531c0f
1 Parent(s): d852007

Upload tokenizer

Browse files
Files changed (3) hide show
  1. tokenizer.json +247 -203
  2. tokenizer_config.json +1 -1
  3. vocab.json +1 -1
tokenizer.json CHANGED
@@ -22,7 +22,7 @@
22
  "special": true
23
  },
24
  {
25
- "id": 5,
26
  "content": "UTT_BOUNDARY",
27
  "single_word": false,
28
  "lstrip": false,
@@ -34,13 +34,6 @@
34
  "normalizer": {
35
  "type": "Sequence",
36
  "normalizers": [
37
- {
38
- "type": "Replace",
39
- "pattern": {
40
- "String": "\n"
41
- },
42
- "content": " UTT_BOUNDARY"
43
- },
44
  {
45
  "type": "Strip",
46
  "strip_left": true,
@@ -51,207 +44,258 @@
51
  "pre_tokenizer": {
52
  "type": "Whitespace"
53
  },
54
- "post_processor": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  "decoder": null,
56
  "model": {
57
  "type": "WordLevel",
58
  "vocab": {
59
  "UNK": 0,
60
  "PAD": 1,
61
- "BOS": 2,
62
- "EOS": 3,
63
- "WORD_BOUNDARY": 4,
64
- "UTT_BOUNDARY": 5,
65
- "s": 6,
66
- "": 7,
67
- "ð": 8,
68
- "ɛ": 9,
69
- "ɹ": 10,
70
- "z": 11,
71
- "ʌ": 12,
72
- "f": 13,
73
- "": 14,
74
- "w": 15,
75
- "ɪ": 16,
76
- "ɡ": 17,
77
- "l": 18,
78
- "æ": 19,
79
- "ɑ": 20,
80
- "h": 21,
81
- "ə": 22,
82
- "ʊ": 23,
83
- "k": 24,
84
- "p": 25,
85
- "": 26,
86
- "b": 27,
87
- "i": 28,
88
- "t": 29,
89
- "": 30,
90
- "θ": 31,
91
- "ŋ": 32,
92
- "j": 33,
93
- "ɔ": 34,
94
- "m": 35,
95
- "ɔɪ": 36,
96
- "n": 37,
97
- "d": 38,
98
- "": 39,
99
- "": 40,
100
- "v": 41,
101
- "ɜː": 42,
102
- "t̠ʃ": 43,
103
- "d̠ʒ": 44,
104
- "ʃ": 45,
105
- "": 46,
106
- "ʒ": 47,
107
- "ɑ̃": 48,
108
- "r": 49,
109
- "x": 50,
110
- "": 51,
111
- "ɒ": 52,
112
- "a": 53,
113
- "ɑː": 54,
114
- "ɔː": 55,
115
- "əʊ": 56,
116
- "ɐ": 57,
117
- "": 58,
118
- "ʊə": 59,
119
- "": 60,
120
- "": 61,
121
- "y": 62,
122
- "ɛ̃": 63,
123
- "ʁ": 64,
124
- "e": 65,
125
- "ɔ̃": 66,
126
- "u": 67,
127
- "o": 68,
128
- "œ̃": 69,
129
- "ø": 70,
130
- "œ": 71,
131
- "": 72,
132
- "": 73,
133
- "ɲ": 74,
134
- "ts": 75,
135
- "": 76,
136
- "ʀ": 77,
137
- "ç": 78,
138
- "ɛɪ": 79,
139
- "ʏ": 80,
140
- "ɛː": 81,
141
- "pf": 82,
142
- "øː": 83,
143
- "": 84,
144
- "ɾ": 85,
145
- "β": 86,
146
- "ʎ": 87,
147
- "ɣ": 88,
148
- "ʝ": 89,
149
- "": 90,
150
- "": 91,
151
- "": 92,
152
- "ɟ": 93,
153
- "ʋ": 94,
154
- "ɪː": 95,
155
- "ɵ": 96,
156
- "œy": 97,
157
- "": 98,
158
- "au̯": 99,
159
- "ʂ": 100,
160
- "ɤ": 101,
161
- "": 102,
162
- "ʈʂʰ": 103,
163
- "ɕ": 104,
164
- "": 105,
165
- "": 106,
166
- "ʈʂ": 107,
167
- "ɹ̩": 108,
168
- "tɕʰ": 109,
169
- "": 110,
170
- "ɻ": 111,
171
- "ɥ": 112,
172
- "tsʰ": 113,
173
- "ei̯": 114,
174
- "ou̯": 115,
175
- "ɻ̩": 116,
176
- "ai̯": 117,
177
- "ɯː": 118,
178
- "ɯ": 119,
179
- "": 120,
180
- "ɸ": 121,
181
- "": 122,
182
- "": 123,
183
- "": 124,
184
- "": 125,
185
- "": 126,
186
- "æi": 127,
187
- "yi": 128,
188
- "ɵː": 129,
189
- "": 130,
190
- "æː": 131,
191
- "": 132,
192
- "": 133,
193
- "œː": 134,
194
- "ʌː": 135,
195
- "ɜ": 136,
196
- "ʔ": 137,
197
- "s̺": 138,
198
- "ts̻": 139,
199
- "": 140,
200
- "c": 141,
201
- "ts̺": 142,
202
- "tsː": 143,
203
- "ɟː": 144,
204
- "t̠ʃː": 145,
205
- "ɡː": 146,
206
- "": 147,
207
- "": 148,
208
- "": 149,
209
- "dzː": 150,
210
- "ɫ": 151,
211
- "ʊː": 152,
212
- "q": 153,
213
- "øy": 154,
214
- "χ": 155,
215
- "": 156,
216
- "": 157,
217
- "": 158,
218
- "": 159,
219
- "ɡʲ": 160,
220
- "": 161,
221
- "": 162,
222
- "çʲ": 163,
223
- "": 164,
224
- "": 165,
225
- "": 166,
226
- "": 167,
227
- "ɨ": 168,
228
- "": 169,
229
- "əɪ": 170,
230
- "ɨː": 171,
231
- "ɬ": 172,
232
- "əɨ": 173,
233
- "ɪu": 174,
234
- "ʉ": 175,
235
- "ʉː": 176,
236
- "ɑɪ": 177,
237
- "ʑ": 178,
238
- "dz": 179,
239
- "d̠ʒː": 180,
240
- "ɐ̃": 181,
241
- "ɛʊ": 182,
242
- "ũ": 183,
243
- "": 184,
244
- "": 185,
245
- "": 186,
246
- "": 187,
247
- "t̠ʃʲ": 188,
248
- "ɔa": 189,
249
- "ea": 190,
250
- "": 191,
251
- "tsʲ": 192,
252
- "eo": 193,
253
- "d̠ʒʲ": 194,
254
- "ɾʲ": 195
255
  },
256
  "unk_token": "UNK"
257
  }
 
22
  "special": true
23
  },
24
  {
25
+ "id": 3,
26
  "content": "UTT_BOUNDARY",
27
  "single_word": false,
28
  "lstrip": false,
 
34
  "normalizer": {
35
  "type": "Sequence",
36
  "normalizers": [
 
 
 
 
 
 
 
37
  {
38
  "type": "Strip",
39
  "strip_left": true,
 
44
  "pre_tokenizer": {
45
  "type": "Whitespace"
46
  },
47
+ "post_processor": {
48
+ "type": "TemplateProcessing",
49
+ "single": [
50
+ {
51
+ "SpecialToken": {
52
+ "id": "UTT_BOUNDARY",
53
+ "type_id": 0
54
+ }
55
+ },
56
+ {
57
+ "Sequence": {
58
+ "id": "A",
59
+ "type_id": 0
60
+ }
61
+ }
62
+ ],
63
+ "pair": [
64
+ {
65
+ "SpecialToken": {
66
+ "id": "UTT_BOUNDARY",
67
+ "type_id": 0
68
+ }
69
+ },
70
+ {
71
+ "Sequence": {
72
+ "id": "A",
73
+ "type_id": 0
74
+ }
75
+ },
76
+ {
77
+ "SpecialToken": {
78
+ "id": "UTT_BOUNDARY",
79
+ "type_id": 0
80
+ }
81
+ },
82
+ {
83
+ "Sequence": {
84
+ "id": "B",
85
+ "type_id": 1
86
+ }
87
+ }
88
+ ],
89
+ "special_tokens": {
90
+ "UTT_BOUNDARY": {
91
+ "id": "UTT_BOUNDARY",
92
+ "ids": [
93
+ 3
94
+ ],
95
+ "tokens": [
96
+ "UTT_BOUNDARY"
97
+ ]
98
+ }
99
+ }
100
+ },
101
  "decoder": null,
102
  "model": {
103
  "type": "WordLevel",
104
  "vocab": {
105
  "UNK": 0,
106
  "PAD": 1,
107
+ "WORD_BOUNDARY": 2,
108
+ "UTT_BOUNDARY": 3,
109
+ "s": 4,
110
+ "": 5,
111
+ "ð": 6,
112
+ "ɛ": 7,
113
+ "ɹ": 8,
114
+ "z": 9,
115
+ "ʌ": 10,
116
+ "f": 11,
117
+ "": 12,
118
+ "w": 13,
119
+ "ɪ": 14,
120
+ "ɡ": 15,
121
+ "l": 16,
122
+ "æ": 17,
123
+ "ɑ": 18,
124
+ "h": 19,
125
+ "ə": 20,
126
+ "ʊ": 21,
127
+ "k": 22,
128
+ "p": 23,
129
+ "": 24,
130
+ "b": 25,
131
+ "i": 26,
132
+ "t": 27,
133
+ "": 28,
134
+ "θ": 29,
135
+ "ŋ": 30,
136
+ "j": 31,
137
+ "ɔ": 32,
138
+ "m": 33,
139
+ "ɔɪ": 34,
140
+ "n": 35,
141
+ "d": 36,
142
+ "": 37,
143
+ "": 38,
144
+ "v": 39,
145
+ "ɜː": 40,
146
+ "t̠ʃ": 41,
147
+ "d̠ʒ": 42,
148
+ "ʃ": 43,
149
+ "": 44,
150
+ "ʒ": 45,
151
+ "ɑ̃": 46,
152
+ "r": 47,
153
+ "x": 48,
154
+ "": 49,
155
+ "ɒ": 50,
156
+ "a": 51,
157
+ "ɑː": 52,
158
+ "ɔː": 53,
159
+ "əʊ": 54,
160
+ "ɐ": 55,
161
+ "": 56,
162
+ "ʊə": 57,
163
+ "": 58,
164
+ "": 59,
165
+ "y": 60,
166
+ "ɛ̃": 61,
167
+ "ʁ": 62,
168
+ "e": 63,
169
+ "ɔ̃": 64,
170
+ "u": 65,
171
+ "o": 66,
172
+ "œ̃": 67,
173
+ "ø": 68,
174
+ "œ": 69,
175
+ "": 70,
176
+ "": 71,
177
+ "ɲ": 72,
178
+ "ts": 73,
179
+ "": 74,
180
+ "ʀ": 75,
181
+ "ç": 76,
182
+ "ɛɪ": 77,
183
+ "ʏ": 78,
184
+ "ɛː": 79,
185
+ "pf": 80,
186
+ "øː": 81,
187
+ "": 82,
188
+ "ɾ": 83,
189
+ "β": 84,
190
+ "ʎ": 85,
191
+ "ɣ": 86,
192
+ "ʝ": 87,
193
+ "": 88,
194
+ "": 89,
195
+ "": 90,
196
+ "ɟ": 91,
197
+ "ʋ": 92,
198
+ "ɪː": 93,
199
+ "ɵ": 94,
200
+ "œy": 95,
201
+ "": 96,
202
+ "au̯": 97,
203
+ "ʂ": 98,
204
+ "ɤ": 99,
205
+ "": 100,
206
+ "ʈʂʰ": 101,
207
+ "ɕ": 102,
208
+ "": 103,
209
+ "": 104,
210
+ "ʈʂ": 105,
211
+ "ɹ̩": 106,
212
+ "tɕʰ": 107,
213
+ "": 108,
214
+ "ɻ": 109,
215
+ "ɥ": 110,
216
+ "tsʰ": 111,
217
+ "ei̯": 112,
218
+ "ou̯": 113,
219
+ "ɻ̩": 114,
220
+ "ai̯": 115,
221
+ "ɯː": 116,
222
+ "ɯ": 117,
223
+ "": 118,
224
+ "ɸ": 119,
225
+ "": 120,
226
+ "": 121,
227
+ "": 122,
228
+ "": 123,
229
+ "": 124,
230
+ "æi": 125,
231
+ "yi": 126,
232
+ "ɵː": 127,
233
+ "": 128,
234
+ "æː": 129,
235
+ "": 130,
236
+ "": 131,
237
+ "œː": 132,
238
+ "ʌː": 133,
239
+ "ɜ": 134,
240
+ "ʔ": 135,
241
+ "": 136,
242
+ "ts̻": 137,
243
+ "s̻": 138,
244
+ "c": 139,
245
+ "ts̺": 140,
246
+ "tsː": 141,
247
+ "ɟː": 142,
248
+ "t̠ʃː": 143,
249
+ "ɡː": 144,
250
+ "": 145,
251
+ "": 146,
252
+ "": 147,
253
+ "dzː": 148,
254
+ "ɫ": 149,
255
+ "ʊː": 150,
256
+ "q": 151,
257
+ "øy": 152,
258
+ "χ": 153,
259
+ "": 154,
260
+ "": 155,
261
+ "": 156,
262
+ "": 157,
263
+ "ɡʲ": 158,
264
+ "": 159,
265
+ "": 160,
266
+ "çʲ": 161,
267
+ "": 162,
268
+ "": 163,
269
+ "": 164,
270
+ "": 165,
271
+ "ɨ": 166,
272
+ "": 167,
273
+ "əɪ": 168,
274
+ "ɨː": 169,
275
+ "ɬ": 170,
276
+ "əɨ": 171,
277
+ "ɪu": 172,
278
+ "ʉ": 173,
279
+ "ʉː": 174,
280
+ "ɑɪ": 175,
281
+ "ʑ": 176,
282
+ "dz": 177,
283
+ "d̠ʒː": 178,
284
+ "ɐ̃": 179,
285
+ "ɛʊ": 180,
286
+ "": 181,
287
+ "": 182,
288
+ "õ": 183,
289
+ "": 184,
290
+ "": 185,
291
+ "t̠ʃʲ": 186,
292
+ "ɔa": 187,
293
+ "ea": 188,
294
+ "": 189,
295
+ "tsʲ": 190,
296
+ "eo": 191,
297
+ "d̠ʒʲ": 192,
298
+ "ɾʲ": 193
 
 
299
  },
300
  "unk_token": "UNK"
301
  }
tokenizer_config.json CHANGED
@@ -17,7 +17,7 @@
17
  "single_word": false,
18
  "special": true
19
  },
20
- "5": {
21
  "content": "UTT_BOUNDARY",
22
  "lstrip": false,
23
  "normalized": false,
 
17
  "single_word": false,
18
  "special": true
19
  },
20
+ "3": {
21
  "content": "UTT_BOUNDARY",
22
  "lstrip": false,
23
  "normalized": false,
vocab.json CHANGED
@@ -1 +1 @@
1
- {"UNK":0,"PAD":1,"BOS":2,"EOS":3,"WORD_BOUNDARY":4,"UTT_BOUNDARY":5,"s":6,"":7,"ð":8,"ɛ":9,"ɹ":10,"z":11,"ʌ":12,"f":13,"":14,"w":15,"ɪ":16,"ɡ":17,"l":18,"æ":19,"ɑ":20,"h":21,"ə":22,"ʊ":23,"k":24,"p":25,"":26,"b":27,"i":28,"t":29,"":30,"θ":31,"ŋ":32,"j":33,"ɔ":34,"m":35,"ɔɪ":36,"n":37,"d":38,"":39,"":40,"v":41,"ɜː":42,"t̠ʃ":43,"d̠ʒ":44,"ʃ":45,"iə":46,"ʒ":47,"ɑ̃":48,"r":49,"x":50,"nʲ":51,"ɒ":52,"a":53,"ɑː":54,"ɔː":55,"əʊ":56,"ɐ":57,"eə":58,"ʊə":59,"n̩":60,"aː":61,"y":62,"ɛ̃":63,"ʁ":64,"e":65,"ɔ̃":66,"u":67,"o":68,"œ̃":69,"ø":70,"œ":71,"oː":72,"yː":73,"ɲ":74,"ts":75,"eː":76,"ʀ":77,"ç":78,"ɛɪ":79,"ʏ":80,"ɛː":81,"pf":82,"øː":83,"ã":84,"ɾ":85,"β":86,"ʎ":87,"ɣ":88,"ʝ":89,"oɪ":90,"eʊ":91,"pː":92,"ɟ":93,"ʋ":94,"ɪː":95,"ɵ":96,"œy":97,"tʲ":98,"au̯":99,"ʂ":100,"ɤ":101,"kʰ":102,"ʈʂʰ":103,"ɕ":104,"pʰ":105,"tɕ":106,"ʈʂ":107,"ɹ̩":108,"tɕʰ":109,"tʰ":110,"ɻ":111,"ɥ":112,"tsʰ":113,"ei̯":114,"ou̯":115,"ɻ̩":116,"ai̯":117,"ɯː":118,"ɯ":119,"pʲ":120,"ɸ":121,"rʲ":122,"kʲ":123,"bʲ":124,"mʲ":125,"kː":126,"æi":127,"yi":128,"ɵː":129,"tː":130,"æː":131,"dʑ":132,"l̩":133,"œː":134,"ʌː":135,"ɜ":136,"ʔ":137,"s̺":138,"ts̻":139,"s̻":140,"c":141,"ts̺":142,"tsː":143,"ɟː":144,"t̠ʃː":145,"ɡː":146,"dː":147,"cː":148,"bː":149,"dzː":150,"ɫ":151,"ʊː":152,"q":153,"øy":154,"χ":155,"i̯":156,"t̪":157,"d̪":158,"lʲ":159,"ɡʲ":160,"hʲ":161,"dʲ":162,"çʲ":163,"uə":164,"ŭ":165,"fʲ":166,"aɨ":167,"ɨ":168,"uɨ":169,"əɪ":170,"ɨː":171,"ɬ":172,"əɨ":173,"ɪu":174,"ʉ":175,"ʉː":176,"ɑɪ":177,"ʑ":178,"dz":179,"d̠ʒː":180,"ɐ̃":181,"ɛʊ":182,"ũ":183,"iʊ":184,"õ":185,"uɪ":186,"sʲ":187,"t̠ʃʲ":188,"ɔa":189,"ea":190,"iɪ":191,"tsʲ":192,"eo":193,"d̠ʒʲ":194,"ɾʲ":195}
 
1
+ {"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"s":4,"":5,"ð":6,"ɛ":7,"ɹ":8,"z":9,"ʌ":10,"f":11,"":12,"w":13,"ɪ":14,"ɡ":15,"l":16,"æ":17,"ɑ":18,"h":19,"ə":20,"ʊ":21,"k":22,"p":23,"":24,"b":25,"i":26,"t":27,"":28,"θ":29,"ŋ":30,"j":31,"ɔ":32,"m":33,"ɔɪ":34,"n":35,"d":36,"":37,"":38,"v":39,"ɜː":40,"t̠ʃ":41,"d̠ʒ":42,"ʃ":43,"iə":44,"ʒ":45,"ɑ̃":46,"r":47,"x":48,"nʲ":49,"ɒ":50,"a":51,"ɑː":52,"ɔː":53,"əʊ":54,"ɐ":55,"eə":56,"ʊə":57,"n̩":58,"aː":59,"y":60,"ɛ̃":61,"ʁ":62,"e":63,"ɔ̃":64,"u":65,"o":66,"œ̃":67,"ø":68,"œ":69,"oː":70,"yː":71,"ɲ":72,"ts":73,"eː":74,"ʀ":75,"ç":76,"ɛɪ":77,"ʏ":78,"ɛː":79,"pf":80,"øː":81,"ã":82,"ɾ":83,"β":84,"ʎ":85,"ɣ":86,"ʝ":87,"oɪ":88,"eʊ":89,"pː":90,"ɟ":91,"ʋ":92,"ɪː":93,"ɵ":94,"œy":95,"tʲ":96,"au̯":97,"ʂ":98,"ɤ":99,"kʰ":100,"ʈʂʰ":101,"ɕ":102,"pʰ":103,"tɕ":104,"ʈʂ":105,"ɹ̩":106,"tɕʰ":107,"tʰ":108,"ɻ":109,"ɥ":110,"tsʰ":111,"ei̯":112,"ou̯":113,"ɻ̩":114,"ai̯":115,"ɯː":116,"ɯ":117,"pʲ":118,"ɸ":119,"rʲ":120,"kʲ":121,"bʲ":122,"mʲ":123,"kː":124,"æi":125,"yi":126,"ɵː":127,"tː":128,"æː":129,"dʑ":130,"l̩":131,"œː":132,"ʌː":133,"ɜ":134,"ʔ":135,"s̺":136,"ts̻":137,"s̻":138,"c":139,"ts̺":140,"tsː":141,"ɟː":142,"t̠ʃː":143,"ɡː":144,"dː":145,"cː":146,"bː":147,"dzː":148,"ɫ":149,"ʊː":150,"q":151,"øy":152,"χ":153,"i̯":154,"t̪":155,"d̪":156,"lʲ":157,"ɡʲ":158,"hʲ":159,"dʲ":160,"çʲ":161,"uə":162,"ŭ":163,"fʲ":164,"aɨ":165,"ɨ":166,"uɨ":167,"əɪ":168,"ɨː":169,"ɬ":170,"əɨ":171,"ɪu":172,"ʉ":173,"ʉː":174,"ɑɪ":175,"ʑ":176,"dz":177,"d̠ʒː":178,"ɐ̃":179,"ɛʊ":180,"ũ":181,"iʊ":182,"õ":183,"uɪ":184,"sʲ":185,"t̠ʃʲ":186,"ɔa":187,"ea":188,"iɪ":189,"tsʲ":190,"eo":191,"d̠ʒʲ":192,"ɾʲ":193}