Added fugashi sample code.
Browse files- fine-tune-whisper-streaming.ipynb +303 -10
fine-tune-whisper-streaming.ipynb
CHANGED
@@ -306,7 +306,7 @@
|
|
306 |
},
|
307 |
{
|
308 |
"cell_type": "code",
|
309 |
-
"execution_count":
|
310 |
"id": "c085911c-a10a-41ef-8874-306e0503e9bb",
|
311 |
"metadata": {},
|
312 |
"outputs": [],
|
@@ -328,7 +328,8 @@
|
|
328 |
" transcription = normalizer(transcription).strip()\n",
|
329 |
" \n",
|
330 |
" # encode target text to label ids\n",
|
331 |
-
"
|
|
|
332 |
" return batch"
|
333 |
]
|
334 |
},
|
@@ -342,7 +343,7 @@
|
|
342 |
},
|
343 |
{
|
344 |
"cell_type": "code",
|
345 |
-
"execution_count":
|
346 |
"id": "a37a7cdb-9013-427f-8de9-6a8d0e9dc684",
|
347 |
"metadata": {},
|
348 |
"outputs": [],
|
@@ -360,7 +361,7 @@
|
|
360 |
},
|
361 |
{
|
362 |
"cell_type": "code",
|
363 |
-
"execution_count":
|
364 |
"id": "1b145699-acfc-4b1d-93a2-a2ad3d62674c",
|
365 |
"metadata": {},
|
366 |
"outputs": [],
|
@@ -381,7 +382,7 @@
|
|
381 |
},
|
382 |
{
|
383 |
"cell_type": "code",
|
384 |
-
"execution_count":
|
385 |
"id": "01cb25ef-4bb0-4325-9461-f59198acadf6",
|
386 |
"metadata": {},
|
387 |
"outputs": [],
|
@@ -402,7 +403,7 @@
|
|
402 |
},
|
403 |
{
|
404 |
"cell_type": "code",
|
405 |
-
"execution_count":
|
406 |
"id": "333f7f6e-6053-4d3b-8924-c733c79b82ac",
|
407 |
"metadata": {},
|
408 |
"outputs": [],
|
@@ -413,14 +414,252 @@
|
|
413 |
")"
|
414 |
]
|
415 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
416 |
{
|
417 |
"cell_type": "code",
|
418 |
"execution_count": null,
|
419 |
-
"id": "
|
420 |
"metadata": {},
|
421 |
"outputs": [],
|
422 |
"source": [
|
423 |
-
"
|
424 |
]
|
425 |
},
|
426 |
{
|
@@ -895,7 +1134,7 @@
|
|
895 |
"execution_count": 26,
|
896 |
"id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de",
|
897 |
"metadata": {
|
898 |
-
"scrolled":
|
899 |
},
|
900 |
"outputs": [
|
901 |
{
|
@@ -1139,7 +1378,7 @@
|
|
1139 |
},
|
1140 |
{
|
1141 |
"cell_type": "code",
|
1142 |
-
"execution_count":
|
1143 |
"id": "95737cda-c5dd-4887-a4d0-dfcb0d61d977",
|
1144 |
"metadata": {},
|
1145 |
"outputs": [
|
@@ -1155,6 +1394,60 @@
|
|
1155 |
"Special tokens file saved in ./special_tokens_map.json\n",
|
1156 |
"added tokens file saved in ./added_tokens.json\n"
|
1157 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1158 |
}
|
1159 |
],
|
1160 |
"source": [
|
|
|
306 |
},
|
307 |
{
|
308 |
"cell_type": "code",
|
309 |
+
"execution_count": 44,
|
310 |
"id": "c085911c-a10a-41ef-8874-306e0503e9bb",
|
311 |
"metadata": {},
|
312 |
"outputs": [],
|
|
|
328 |
" transcription = normalizer(transcription).strip()\n",
|
329 |
" \n",
|
330 |
" # encode target text to label ids\n",
|
331 |
+
"# batch[\"labels\"] = processor.tokenizer(transcription).input_ids\n",
|
332 |
+
" batch['labels'] = transcription\n",
|
333 |
" return batch"
|
334 |
]
|
335 |
},
|
|
|
343 |
},
|
344 |
{
|
345 |
"cell_type": "code",
|
346 |
+
"execution_count": 45,
|
347 |
"id": "a37a7cdb-9013-427f-8de9-6a8d0e9dc684",
|
348 |
"metadata": {},
|
349 |
"outputs": [],
|
|
|
361 |
},
|
362 |
{
|
363 |
"cell_type": "code",
|
364 |
+
"execution_count": 46,
|
365 |
"id": "1b145699-acfc-4b1d-93a2-a2ad3d62674c",
|
366 |
"metadata": {},
|
367 |
"outputs": [],
|
|
|
382 |
},
|
383 |
{
|
384 |
"cell_type": "code",
|
385 |
+
"execution_count": 47,
|
386 |
"id": "01cb25ef-4bb0-4325-9461-f59198acadf6",
|
387 |
"metadata": {},
|
388 |
"outputs": [],
|
|
|
403 |
},
|
404 |
{
|
405 |
"cell_type": "code",
|
406 |
+
"execution_count": 48,
|
407 |
"id": "333f7f6e-6053-4d3b-8924-c733c79b82ac",
|
408 |
"metadata": {},
|
409 |
"outputs": [],
|
|
|
414 |
")"
|
415 |
]
|
416 |
},
|
417 |
+
{
|
418 |
+
"cell_type": "code",
|
419 |
+
"execution_count": 49,
|
420 |
+
"id": "bede1184",
|
421 |
+
"metadata": {},
|
422 |
+
"outputs": [
|
423 |
+
{
|
424 |
+
"name": "stderr",
|
425 |
+
"output_type": "stream",
|
426 |
+
"text": [
|
427 |
+
"Reading metadata...: 6505it [00:00, 35406.66it/s]\n",
|
428 |
+
"Reading metadata...: 4485it [00:00, 19930.24it/s]\n"
|
429 |
+
]
|
430 |
+
},
|
431 |
+
{
|
432 |
+
"data": {
|
433 |
+
"text/plain": [
|
434 |
+
"'多から一へというのは、世界を因果的に決定論的に考えることである、過去から考えることである、機械的に考えることである。'"
|
435 |
+
]
|
436 |
+
},
|
437 |
+
"execution_count": 49,
|
438 |
+
"metadata": {},
|
439 |
+
"output_type": "execute_result"
|
440 |
+
}
|
441 |
+
],
|
442 |
+
"source": [
|
443 |
+
"xb = next(iter(vectorized_datasets['train']))\n",
|
444 |
+
"xb['labels']"
|
445 |
+
]
|
446 |
+
},
|
447 |
+
{
|
448 |
+
"cell_type": "code",
|
449 |
+
"execution_count": 59,
|
450 |
+
"id": "ac1e8d5b",
|
451 |
+
"metadata": {},
|
452 |
+
"outputs": [
|
453 |
+
{
|
454 |
+
"name": "stdout",
|
455 |
+
"output_type": "stream",
|
456 |
+
"text": [
|
457 |
+
"<|startoftranscript|>\n",
|
458 |
+
"<|ja|>\n",
|
459 |
+
"<|transcribe|>\n",
|
460 |
+
"<|notimestamps|>\n",
|
461 |
+
"ๅค\n",
|
462 |
+
"ใใ\n",
|
463 |
+
"ไธ\n",
|
464 |
+
"ใธ\n",
|
465 |
+
"ใจใใ\n",
|
466 |
+
"ใฎใฏ\n",
|
467 |
+
"ใ\n",
|
468 |
+
"ไธ็\n",
|
469 |
+
"ใ\n",
|
470 |
+
"ๅ \n",
|
471 |
+
"ๆ\n",
|
472 |
+
"็\n",
|
473 |
+
"ใซ\n",
|
474 |
+
"ๆฑบ\n",
|
475 |
+
"ๅฎ\n",
|
476 |
+
"่ซ\n",
|
477 |
+
"็\n",
|
478 |
+
"ใซ\n",
|
479 |
+
"่\n",
|
480 |
+
"ใใ\n",
|
481 |
+
"ใใจ\n",
|
482 |
+
"ใง\n",
|
483 |
+
"ใใ\n",
|
484 |
+
"ใ\n",
|
485 |
+
"้ๅป\n",
|
486 |
+
"ใใ\n",
|
487 |
+
"่\n",
|
488 |
+
"ใใ\n",
|
489 |
+
"ใใจ\n",
|
490 |
+
"ใง\n",
|
491 |
+
"ใใ\n",
|
492 |
+
"ใ\n",
|
493 |
+
"ๆฉ\n",
|
494 |
+
"๏ฟฝ\n",
|
495 |
+
"๏ฟฝ\n",
|
496 |
+
"็\n",
|
497 |
+
"ใซ\n",
|
498 |
+
"่\n",
|
499 |
+
"ใใ\n",
|
500 |
+
"ใใจ\n",
|
501 |
+
"ใง\n",
|
502 |
+
"ใใ\n",
|
503 |
+
"ใ\n",
|
504 |
+
"<|endoftext|>\n"
|
505 |
+
]
|
506 |
+
}
|
507 |
+
],
|
508 |
+
"source": [
|
509 |
+
"idxs = processor.tokenizer(xb['labels']).input_ids\n",
|
510 |
+
"for idx in idxs:\n",
|
511 |
+
" print(processor.tokenizer.decode(idx))"
|
512 |
+
]
|
513 |
+
},
|
514 |
+
{
|
515 |
+
"cell_type": "code",
|
516 |
+
"execution_count": 60,
|
517 |
+
"id": "d33cefc4",
|
518 |
+
"metadata": {},
|
519 |
+
"outputs": [
|
520 |
+
{
|
521 |
+
"data": {
|
522 |
+
"text/plain": [
|
523 |
+
"[ๅคใใ,\n",
|
524 |
+
" ไธ,\n",
|
525 |
+
" ใธ,\n",
|
526 |
+
" ใจ,\n",
|
527 |
+
" ใใ,\n",
|
528 |
+
" ใฎ,\n",
|
529 |
+
" ใฏ,\n",
|
530 |
+
" ใ,\n",
|
531 |
+
" ไธ็,\n",
|
532 |
+
" ใ,\n",
|
533 |
+
" ๅ ๆ,\n",
|
534 |
+
" ็,\n",
|
535 |
+
" ใซ,\n",
|
536 |
+
" ๆฑบๅฎ,\n",
|
537 |
+
" ่ซ,\n",
|
538 |
+
" ็,\n",
|
539 |
+
" ใซ,\n",
|
540 |
+
" ่ใใ,\n",
|
541 |
+
" ใใจ,\n",
|
542 |
+
" ใง,\n",
|
543 |
+
" ใใ,\n",
|
544 |
+
" ใ,\n",
|
545 |
+
" ้ๅป,\n",
|
546 |
+
" ใใ,\n",
|
547 |
+
" ่ใใ,\n",
|
548 |
+
" ใใจ,\n",
|
549 |
+
" ใง,\n",
|
550 |
+
" ใใ,\n",
|
551 |
+
" ใ,\n",
|
552 |
+
" ๆฉๆขฐ,\n",
|
553 |
+
" ็,\n",
|
554 |
+
" ใซ,\n",
|
555 |
+
" ่ใใ,\n",
|
556 |
+
" ใใจ,\n",
|
557 |
+
" ใง,\n",
|
558 |
+
" ใใ,\n",
|
559 |
+
" ใ]"
|
560 |
+
]
|
561 |
+
},
|
562 |
+
"execution_count": 60,
|
563 |
+
"metadata": {},
|
564 |
+
"output_type": "execute_result"
|
565 |
+
}
|
566 |
+
],
|
567 |
+
"source": [
|
568 |
+
"tagger(xb['labels'])"
|
569 |
+
]
|
570 |
+
},
|
571 |
+
{
|
572 |
+
"cell_type": "code",
|
573 |
+
"execution_count": 55,
|
574 |
+
"id": "2cbb82ef",
|
575 |
+
"metadata": {},
|
576 |
+
"outputs": [
|
577 |
+
{
|
578 |
+
"name": "stdout",
|
579 |
+
"output_type": "stream",
|
580 |
+
"text": [
|
581 |
+
"Help on method decode in module transformers.tokenization_utils_base:\n",
|
582 |
+
"\n",
|
583 |
+
"decode(token_ids: Union[int, List[int], ForwardRef('np.ndarray'), ForwardRef('torch.Tensor'), ForwardRef('tf.Tensor')], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, **kwargs) -> str method of transformers.models.whisper.tokenization_whisper.WhisperTokenizer instance\n",
|
584 |
+
" Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special\n",
|
585 |
+
" tokens and clean up tokenization spaces.\n",
|
586 |
+
" \n",
|
587 |
+
" Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.\n",
|
588 |
+
" \n",
|
589 |
+
" Args:\n",
|
590 |
+
" token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):\n",
|
591 |
+
" List of tokenized input ids. Can be obtained using the `__call__` method.\n",
|
592 |
+
" skip_special_tokens (`bool`, *optional*, defaults to `False`):\n",
|
593 |
+
" Whether or not to remove special tokens in the decoding.\n",
|
594 |
+
" clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):\n",
|
595 |
+
" Whether or not to clean up the tokenization spaces.\n",
|
596 |
+
" kwargs (additional keyword arguments, *optional*):\n",
|
597 |
+
" Will be passed to the underlying model specific decode method.\n",
|
598 |
+
" \n",
|
599 |
+
" Returns:\n",
|
600 |
+
" `str`: The decoded sentence.\n",
|
601 |
+
"\n"
|
602 |
+
]
|
603 |
+
}
|
604 |
+
],
|
605 |
+
"source": [
|
606 |
+
"help(processor.tokenizer.decode)"
|
607 |
+
]
|
608 |
+
},
|
609 |
+
{
|
610 |
+
"cell_type": "code",
|
611 |
+
"execution_count": 41,
|
612 |
+
"id": "b4b9bbfc",
|
613 |
+
"metadata": {},
|
614 |
+
"outputs": [
|
615 |
+
{
|
616 |
+
"data": {
|
617 |
+
"text/plain": [
|
618 |
+
"'麩 菓子 は 、 麩 を 主材 料 と し た 日本 の 菓子 。'"
|
619 |
+
]
|
620 |
+
},
|
621 |
+
"execution_count": 41,
|
622 |
+
"metadata": {},
|
623 |
+
"output_type": "execute_result"
|
624 |
+
}
|
625 |
+
],
|
626 |
+
"source": [
|
627 |
+
"from fugashi import Tagger\n",
|
628 |
+
"\n",
|
629 |
+
"tagger = Tagger('-Owakati')\n",
|
630 |
+
"text = \"麩菓子は、麩を主材料とした日本の菓子。\"\n",
|
631 |
+
"tagger.parse(text)"
|
632 |
+
]
|
633 |
+
},
|
634 |
+
{
|
635 |
+
"cell_type": "code",
|
636 |
+
"execution_count": 43,
|
637 |
+
"id": "833ca62d",
|
638 |
+
"metadata": {},
|
639 |
+
"outputs": [
|
640 |
+
{
|
641 |
+
"data": {
|
642 |
+
"text/plain": [
|
643 |
+
"[麩, 菓子, は, 、, 麩, を, 主材, 料, と, し, た, 日本, の, 菓子, 。]"
|
644 |
+
]
|
645 |
+
},
|
646 |
+
"execution_count": 43,
|
647 |
+
"metadata": {},
|
648 |
+
"output_type": "execute_result"
|
649 |
+
}
|
650 |
+
],
|
651 |
+
"source": [
|
652 |
+
"tagger(text)"
|
653 |
+
]
|
654 |
+
},
|
655 |
{
|
656 |
"cell_type": "code",
|
657 |
"execution_count": null,
|
658 |
+
"id": "7b7854d6",
|
659 |
"metadata": {},
|
660 |
"outputs": [],
|
661 |
"source": [
|
662 |
+
"raw_datasets['']"
|
663 |
]
|
664 |
},
|
665 |
{
|
|
|
1134 |
"execution_count": 26,
|
1135 |
"id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de",
|
1136 |
"metadata": {
|
1137 |
+
"scrolled": true
|
1138 |
},
|
1139 |
"outputs": [
|
1140 |
{
|
|
|
1378 |
},
|
1379 |
{
|
1380 |
"cell_type": "code",
|
1381 |
+
"execution_count": 28,
|
1382 |
"id": "95737cda-c5dd-4887-a4d0-dfcb0d61d977",
|
1383 |
"metadata": {},
|
1384 |
"outputs": [
|
|
|
1394 |
"Special tokens file saved in ./special_tokens_map.json\n",
|
1395 |
"added tokens file saved in ./added_tokens.json\n"
|
1396 |
]
|
1397 |
+
},
|
1398 |
+
{
|
1399 |
+
"data": {
|
1400 |
+
"application/vnd.jupyter.widget-view+json": {
|
1401 |
+
"model_id": "a47d7e61b9144723a4208cc4cc492eee",
|
1402 |
+
"version_major": 2,
|
1403 |
+
"version_minor": 0
|
1404 |
+
},
|
1405 |
+
"text/plain": [
|
1406 |
+
"Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]"
|
1407 |
+
]
|
1408 |
+
},
|
1409 |
+
"metadata": {},
|
1410 |
+
"output_type": "display_data"
|
1411 |
+
},
|
1412 |
+
{
|
1413 |
+
"data": {
|
1414 |
+
"application/vnd.jupyter.widget-view+json": {
|
1415 |
+
"model_id": "a7eb0d82c2fd4f978981915aa2314463",
|
1416 |
+
"version_major": 2,
|
1417 |
+
"version_minor": 0
|
1418 |
+
},
|
1419 |
+
"text/plain": [
|
1420 |
+
"Upload file runs/Dec12_04-37-47_150-136-44-233/events.out.tfevents.1670819878.150-136-44-233.69039.0: 100%|###โฆ"
|
1421 |
+
]
|
1422 |
+
},
|
1423 |
+
"metadata": {},
|
1424 |
+
"output_type": "display_data"
|
1425 |
+
},
|
1426 |
+
{
|
1427 |
+
"name": "stderr",
|
1428 |
+
"output_type": "stream",
|
1429 |
+
"text": [
|
1430 |
+
"remote: Scanning LFS files for validity, may be slow... \n",
|
1431 |
+
"remote: LFS file scan complete. \n",
|
1432 |
+
"To https://huggingface.co/kimbochen/whisper-small-jp\n",
|
1433 |
+
" d83a98f..0ff52f0 main -> main\n",
|
1434 |
+
"\n",
|
1435 |
+
"Dropping the following result as it does not have all the necessary fields:\n",
|
1436 |
+
"{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}, 'dataset': {'name': 'Common Voice 11.0', 'type': 'mozilla-foundation/common_voice_11_0', 'config': 'ja', 'split': 'test', 'args': 'ja'}}\n",
|
1437 |
+
"To https://huggingface.co/kimbochen/whisper-small-jp\n",
|
1438 |
+
" 0ff52f0..22e3a01 main -> main\n",
|
1439 |
+
"\n"
|
1440 |
+
]
|
1441 |
+
},
|
1442 |
+
{
|
1443 |
+
"data": {
|
1444 |
+
"text/plain": [
|
1445 |
+
"'https://huggingface.co/kimbochen/whisper-small-jp/commit/0ff52f0f1d63daf816427096a83f7bbf8f3892eb'"
|
1446 |
+
]
|
1447 |
+
},
|
1448 |
+
"execution_count": 28,
|
1449 |
+
"metadata": {},
|
1450 |
+
"output_type": "execute_result"
|
1451 |
}
|
1452 |
],
|
1453 |
"source": [
|