Update README.md
Browse files
README.md
CHANGED
@@ -12,25 +12,379 @@ language:
|
|
12 |
- ja
|
13 |
---
|
14 |
|
15 |
-
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
16 |
-
should probably proofread and complete it, then remove this comment. -->
|
17 |
-
|
18 |
# Hibiki ASR Phonemizer
|
19 |
|
20 |
This model is a Phoneme Level Speech Recognition network, originally a fine-tuned version of [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) on a
|
21 |
mixture of Different Japanese datasets.
|
22 |
-
|
23 |
-
it
|
24 |
-
|
|
|
|
|
|
|
|
|
25 |
|
26 |
|
27 |
evaluation set:
|
28 |
- Loss: 0.2186
|
29 |
- Wer: 21.6707
|
30 |
|
31 |
-
## Model description
|
32 |
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
## Intended uses & limitations
|
36 |
|
|
|
12 |
- ja
|
13 |
---
|
14 |
|
|
|
|
|
|
|
15 |
# Hibiki ASR Phonemizer
|
16 |
|
17 |
This model is a phoneme-level speech recognition network, originally a fine-tuned version of [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) on a
|
18 |
mixture of different Japanese datasets.
|
19 |
+
|
20 |
+
It can detect, transcribe, and do the following:
|
21 |
+
|
22 |
+
- non-speech sounds such as gasps, erotic moans, etc.
|
23 |
+
- adding punctuation more faithfully.
|
24 |
+
|
25 |
+
A grapheme-level model (i.e., normal Japanese text output) will probably be trained as well.
|
26 |
|
27 |
|
28 |
Results on the evaluation set:
|
29 |
- Loss: 0.2186
|
30 |
- Wer: 21.6707
|
31 |
|
|
|
32 |
|
33 |
+
## Inference and Post-processing
|
34 |
+
|
35 |
+
```python
|
36 |
+
|
37 |
+
# This function was borrowed and modified from Aaron Yinghao Li, the author of the StyleTTS paper.
|
38 |
+
|
39 |
+
# Hiragana -> IPA replacement table.
#
# Entries are applied one after another with ``str.replace`` in insertion
# order, so every multi-character kana sequence MUST appear before the
# single kana it starts with (e.g. "きゃ" before "き"). Keep that ordering
# when adding entries.
#
# Keys that were listed more than once in the earlier revision of this table
# are de-duplicated here, keeping the value that was effectively in use (the
# last one listed). In particular "ずぅ" and "ふぅ" resolve to the short-vowel
# forms "zɯ" / "ɸɯ".
kana_mapper = {
    # --- ゔ combinations ---
    "ゔぁ": "ba", "ゔぃ": "bi", "ゔぇ": "be", "ゔぉ": "bo",
    "ゔゃ": "bʲa", "ゔゅ": "bʲɯ", "ゔょ": "bʲo",
    "ゔ": "bɯ",

    # --- kana followed by a small vowel / small ya-yu-yo ---
    "あぁ": " aː", "いぃ": " iː", "いぇ": " je", "いゃ": " ja",
    "うぅ": " ɯː", "えぇ": " eː", "おぉ": " oː",
    "かぁ": " kaː", "きぃ": " kiː", "くぅ": "kɯː",
    "くゃ": "kʲa", "くゅ": "kʲɯ", "くょ": "kʲo",
    "けぇ": "keː", "こぉ": "koː",
    "がぁ": "gaː", "ぎぃ": "giː", "ぐぅ": "gɯː",
    "ぐゃ": "gʲa", "ぐゅ": "gʲɯ", "ぐょ": "gʲo",
    "げぇ": "geː", "ごぉ": "goː",
    "さぁ": "saː", "しぃ": "ɕiː", "すぅ": "sɯː",
    "すゃ": "sʲa", "すゅ": "sʲɯ", "すょ": "sʲo",
    "せぇ": "seː", "そぉ": "soː",
    "ざぁ": "zaː", "じぃ": "dʑiː", "ずぅ": "zɯ",
    "ずゃ": "zʲa", "ずゅ": "zʲɯ", "ずょ": "zʲo",
    "ぜぇ": "zeː", "ぞぉ": "zoː",
    "たぁ": "taː", "ちぃ": "tɕiː",
    "つぁ": "tsa", "つぃ": "tsi", "つぅ": "tsɯː",
    "つゃ": "tɕa", "つゅ": "tɕɯ", "つょ": "tɕo",
    "つぇ": "tse", "つぉ": "tso",
    "てぇ": "teː", "とぉ": "toː",
    "だぁ": "daː", "ぢぃ": "dʑiː", "づぅ": "dɯː",
    "づゃ": "zʲa", "づゅ": "zʲɯ", "づょ": "zʲo",
    "でぇ": "deː", "どぉ": "doː",
    "なぁ": "naː", "にぃ": "niː", "ぬぅ": "nɯː",
    "ぬゃ": "nʲa", "ぬゅ": "nʲɯ", "ぬょ": "nʲo",
    "ねぇ": "neː", "のぉ": "noː",
    "はぁ": "haː", "ひぃ": "çiː", "ふぅ": "ɸɯ",
    "ふゃ": "ɸʲa", "ふゅ": "ɸʲɯ", "ふょ": "ɸʲo",
    "へぇ": "heː", "ほぉ": "hoː",
    "ばぁ": "baː", "びぃ": "biː", "ぶぅ": "bɯː",
    "ぶゃ": "bʲa", "ぶゅ": "bʲɯ", "ぶょ": "bʲo",
    "べぇ": "beː", "ぼぉ": "boː",
    "ぱぁ": "paː", "ぴぃ": "piː", "ぷぅ": "pɯː",
    "ぷゃ": "pʲa", "ぷゅ": "pʲɯ", "ぷょ": "pʲo",
    "ぺぇ": "peː", "ぽぉ": "poː",
    "まぁ": "maː", "みぃ": "miː", "むぅ": "mɯː",
    "むゃ": "mʲa", "むゅ": "mʲɯ", "むょ": "mʲo",
    "めぇ": "meː", "もぉ": "moː",
    "やぁ": "jaː", "ゆぅ": "jɯː",
    "ゆゃ": "jaː", "ゆゅ": "jɯː", "ゆょ": "joː",
    "よぉ": "joː",
    "らぁ": "ɽaː", "りぃ": "ɽiː", "るぅ": "ɽɯː",
    "るゃ": "ɽʲa", "るゅ": "ɽʲɯ", "るょ": "ɽʲo",
    "れぇ": "ɽeː", "ろぉ": "ɽoː",
    "わぁ": "ɯaː", "をぉ": "oː",

    # --- other two-character sequences ---
    "う゛": "bɯ",
    "でぃ": "di", "でゃ": "dʲa", "でゅ": "dʲɯ", "でょ": "dʲo",
    "てぃ": "ti", "てゃ": "tʲa", "てゅ": "tʲɯ", "てょ": "tʲo",
    "すぃ": "si",
    "ずぁ": "zɯa", "ずぃ": "zi", "ずぇ": "ze", "ずぉ": "zo",
    "きゃ": "kʲa", "きゅ": "kʲɯ", "きょ": "kʲo",
    "しゃ": "ɕʲa", "しゅ": "ɕʲɯ", "しぇ": "ɕʲe", "しょ": "ɕʲo",
    "ちゃ": "tɕa", "ちゅ": "tɕɯ", "ちぇ": "tɕe", "ちょ": "tɕo",
    "とぅ": "tɯ", "とゃ": "tʲa", "とゅ": "tʲɯ", "とょ": "tʲo",
    "どぁ": "doa", "どぅ": "dɯ",
    "どゃ": "dʲa", "どゅ": "dʲɯ", "どょ": "dʲo",
    "にゃ": "nʲa", "にゅ": "nʲɯ", "にょ": "nʲo",
    "ひゃ": "çʲa", "ひゅ": "çʲɯ", "ひょ": "çʲo",
    "みゃ": "mʲa", "みゅ": "mʲɯ", "みょ": "mʲo",
    "りゃ": "ɽʲa", "りぇ": "ɽʲe", "りゅ": "ɽʲɯ", "りょ": "ɽʲo",
    "ぎゃ": "gʲa", "ぎゅ": "gʲɯ", "ぎょ": "gʲo",
    "ぢぇ": "dʑe", "ぢゃ": "dʑa", "ぢゅ": "dʑɯ", "ぢょ": "dʑo",
    "じぇ": "dʑe", "じゃ": "dʑa", "じゅ": "dʑɯ", "じょ": "dʑo",
    "びゃ": "bʲa", "びゅ": "bʲɯ", "びょ": "bʲo",
    "ぴゃ": "pʲa", "ぴゅ": "pʲɯ", "ぴょ": "pʲo",
    "うぁ": "ɯa", "うぃ": "ɯi", "うぇ": "ɯe", "うぉ": "ɯo",
    "うゃ": "ɯʲa", "うゅ": "ɯʲɯ", "うょ": "ɯʲo",
    "ふぁ": "ɸa", "ふぃ": "ɸi", "ふぇ": "ɸe", "ふぉ": "ɸo",

    # --- single kana ---
    "あ": " a", "い": " i", "う": "ɯ", "え": " e", "お": " o",
    "か": " ka", "き": " ki", "く": " kɯ", "け": " ke", "こ": " ko",
    "さ": " sa", "し": " ɕi", "す": " sɯ", "せ": " se", "そ": " so",
    "た": " ta", "ち": " tɕi", "つ": " tsɯ", "て": " te", "と": " to",
    "な": " na", "に": " ni", "ぬ": " nɯ", "ね": " ne", "の": " no",
    "は": " ha", "ひ": " çi", "ふ": " ɸɯ", "へ": " he", "ほ": " ho",
    "ま": " ma", "み": " mi", "む": " mɯ", "め": " me", "も": " mo",
    "ら": " ɽa", "り": " ɽi", "る": " ɽɯ", "れ": " ɽe", "ろ": " ɽo",
    "が": " ga", "ぎ": " gi", "ぐ": " gɯ", "げ": " ge", "ご": " go",
    "ざ": " za", "じ": " dʑi", "ず": " zɯ", "ぜ": " ze", "ぞ": " zo",
    "だ": " da", "ぢ": " dʑi", "づ": " zɯ", "で": " de", "ど": " do",
    "ば": " ba", "び": " bi", "ぶ": " bɯ", "べ": " be", "ぼ": " bo",
    "ぱ": " pa", "ぴ": " pi", "ぷ": " pɯ", "ぺ": " pe", "ぽ": " po",
    "や": " ja", "ゆ": " jɯ", "よ": " jo",
    "わ": " ɯa", "ゐ": " i", "ゑ": " e",
    "ん": " ɴ", "っ": " ʔ", "ー": " ː",

    # --- small kana left over after the combinations above ---
    "ぁ": " a", "ぃ": " i", "ぅ": " ɯ", "ぇ": " e", "ぉ": " o",
    "ゎ": " ɯa",

    "を": "o",
}
|
339 |
+
|
340 |
+
|
341 |
+
def post_fix(text, mapper=None):
    """Replace any kana left in *text* with their phoneme strings.

    Args:
        text: String to clean up (e.g. a decoded transcription).
        mapper: Optional mapping of kana sequence -> replacement string.
            Defaults to the module-level ``kana_mapper``.

    Returns:
        A new string with every key of the mapping replaced by its value.

    Replacements are applied one key at a time in the mapping's insertion
    order, so multi-character keys must be listed before the single
    characters they contain for them to take effect.
    """
    if mapper is None:
        mapper = kana_mapper

    for kana, phonemes in mapper.items():
        text = text.replace(kana, phonemes)

    return text
|
348 |
+
|
349 |
+
from datasets import Dataset, Audio
|
350 |
+
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
351 |
+
|
352 |
+
# The model and its input features must live on the same device.
device = "cuda:0"

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
model = WhisperForConditionalGeneration.from_pretrained("Respair/Hibiki_ASR_Phonemizer").to(device)

forced_decoder_ids = processor.get_decoder_prompt_ids(task="transcribe", language='japanese')

# Load a single audio file through `datasets`, casting the column to Audio(16000).
sample = Dataset.from_dict({"audio": ["/content/kl_chunk1987.wav"]}).cast_column("audio", Audio(16000))
sample = sample[0]['audio']

# Extract input features and move them to the model's device.
input_features = processor(
    sample["array"],
    sampling_rate=sample["sampling_rate"],
    return_tensors="pt",
).input_features.to(device)

# Generate token ids.
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids, repetition_penalty=1.2)

# Decode token ids to text.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

# Hand-written corrections, applied in order. `str.replace` leaves the string
# unchanged when the pattern is absent, so no membership check is needed.
manual_fixes = (
    ("ki ni ɕinai", " ki ni ɕinai"),
    (' ʔt', "ʔt"),
    (' neɽitai ', "naɽitai"),
    ('harɯdʑisama', "arɯdʑisama"),
    ('de aɽoɯ', " de aɽoɯ"),
)
for wrong, right in manual_fixes:
    transcription[0] = transcription[0].replace(wrong, right)

# Convert any kana left in the output to phonemes and show the result.
phonemes = post_fix(transcription[0].lstrip())
print(phonemes)
|
386 |
+
|
387 |
+
```
|
388 |
|
389 |
## Intended uses & limitations
|
390 |
|