Files changed (2)
  1. .gitignore +3 -0
  2. distilbert_japanese_tokenizer.py +11 -172
.gitignore ADDED
@@ -0,0 +1,3 @@
+.venv
+poetry.lock
+pyproject.toml
distilbert_japanese_tokenizer.py CHANGED
@@ -29,11 +29,15 @@ from typing import Any, Dict, List, Optional, Tuple
 from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from transformers.utils import is_sentencepiece_available, logging
 
-
-if is_sentencepiece_available():
+try:
     import sentencepiece as spm
-else:
-    spm = None
+except ModuleNotFoundError as error:
+    raise error.__class__(
+        "The sentencepiece is not installed. "
+        "See https://github.com/google/sentencepiece for installation."
+    )
+
+
 
 logger = logging.get_logger(__name__)
 
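
This hunk turns sentencepiece from an optional dependency into a hard one: the old guard left spm = None when the package was missing and deferred the failure to first use, while the new guard fails at import time with an actionable message. A minimal sketch of the new behavior (hypothetical session; module name assumed from the filename, sentencepiece assumed absent):

try:
    import distilbert_japanese_tokenizer  # module name assumed from the filename
except ModuleNotFoundError as err:
    # Before this change the import succeeded and spm was silently None;
    # now the error points straight at the missing package.
    print(err)

One side effect visible in the context lines: is_sentencepiece_available is still imported from transformers.utils but is no longer used.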
 
@@ -436,7 +440,7 @@ class MecabTokenizer:
         do_lower_case=False,
         never_split=None,
         normalize_text=True,
-        mecab_dic: Optional[str] = "ipadic",
+        mecab_dic: Optional[str] = "unidic_lite",
         mecab_option: Optional[str] = None,
     ):
         """
@@ -450,7 +454,7 @@
                 [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
             **normalize_text**: (*optional*) boolean (default True)
                 Whether to apply unicode normalization to text before tokenization.
-            **mecab_dic**: (*optional*) string (default "ipadic")
+            **mecab_dic**: (*optional*) string (default "unidic_lite")
                 Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
                 set this option to `None` and modify *mecab_option*.
             **mecab_option**: (*optional*) string
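
Both the signature default and the docstring move from "ipadic" to "unidic_lite". A short usage sketch of the new default (import path assumed from the filename; assumes fugashi/MeCab and the unidic_lite package are installed):

from distilbert_japanese_tokenizer import MecabTokenizer  # import path assumed

tokenizer = MecabTokenizer()                        # now uses unidic_lite
explicit = MecabTokenizer(mecab_dic="unidic_lite")  # identical; only the default changed
print(tokenizer.tokenize("私は機械学習が好きです。"))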
@@ -471,18 +475,7 @@
         mecab_option = mecab_option or ""
 
         if mecab_dic is not None:
-            if mecab_dic == "ipadic":
-                try:
-                    import ipadic
-                except ModuleNotFoundError as error:
-                    raise error.__class__(
-                        "The ipadic dictionary is not installed. "
-                        "See https://github.com/polm/ipadic-py for installation."
-                    )
-
-                dic_dir = ipadic.DICDIR
-
-            elif mecab_dic == "unidic_lite":
+            if mecab_dic == "unidic_lite":
                 try:
                     import unidic_lite
                 except ModuleNotFoundError as error:
@@ -492,7 +485,6 @@
                     )
 
                 dic_dir = unidic_lite.DICDIR
-
             elif mecab_dic == "unidic":
                 try:
                     import unidic
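
With the ipadic branch deleted, mecab_dic="ipadic" no longer selects a bundled dictionary. The docstring's escape hatch still applies: pass mecab_dic=None and hand MeCab a dictionary through mecab_option. A sketch for callers who want to keep ipadic (assumes the ipadic-py package; depending on your MeCab install you may also need -r to point at a mecabrc):

import ipadic  # the package the removed branch used to import

tokenizer = MecabTokenizer(
    mecab_dic=None,                        # skip the bundled-dictionary branches
    mecab_option=f'-d "{ipadic.DICDIR}"',  # -d selects a MeCab dictionary directory
)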
@@ -536,159 +528,6 @@
         return tokens
 
 
-class SudachiTokenizer:
-    """Runs basic tokenization with Sudachi morphological parser."""
-
-    def __init__(
-        self,
-        do_lower_case=False,
-        never_split=None,
-        normalize_text=True,
-        trim_whitespace=False,
-        sudachi_split_mode="A",
-        sudachi_config_path=None,
-        sudachi_resource_dir=None,
-        sudachi_dict_type="core",
-    ):
-        """
-        Constructs a SudachiTokenizer.
-
-        Args:
-            **do_lower_case**: (*optional*) boolean (default True)
-                Whether to lowercase the input.
-            **never_split**: (*optional*) list of str
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
-            **normalize_text**: (*optional*) boolean (default True)
-                Whether to apply unicode normalization to text before tokenization.
-            **trim_whitespace**: (*optional*) boolean (default False)
-                Whether to trim all whitespace, tab, newline from tokens.
-            **sudachi_split_mode**: (*optional*) string
-                Split mode of sudachi, choose from "A", "B", "C".
-            **sudachi_config_path**: (*optional*) string
-            **sudachi_resource_dir**: (*optional*) string
-            **sudachi_dict_type**: (*optional*) string
-                dict type of sudachi, choose from "small", "core", "full".
-        """
-
-        self.do_lower_case = do_lower_case
-        self.never_split = never_split if never_split is not None else []
-        self.normalize_text = normalize_text
-        self.trim_whitespace = trim_whitespace
-
-        try:
-            from sudachipy import dictionary, tokenizer
-        except ImportError:
-            raise ImportError(
-                "You need to install sudachipy to use SudachiTokenizer. "
-                "See https://github.com/WorksApplications/SudachiPy for installation."
-            )
-
-        if sudachi_split_mode == "A":
-            self.split_mode = tokenizer.Tokenizer.SplitMode.A
-        elif sudachi_split_mode == "B":
-            self.split_mode = tokenizer.Tokenizer.SplitMode.B
-        elif sudachi_split_mode == "C":
-            self.split_mode = tokenizer.Tokenizer.SplitMode.C
-        else:
-            raise ValueError("Invalid sudachi_split_mode is specified.")
-
-        self.sudachi = dictionary.Dictionary(
-            config_path=sudachi_config_path, resource_dir=sudachi_resource_dir, dict=sudachi_dict_type
-        ).create(self.split_mode)
-
-    def tokenize(self, text, never_split=None, **kwargs):
-        """Tokenizes a piece of text."""
-        if self.normalize_text:
-            text = unicodedata.normalize("NFKC", text)
-
-        never_split = self.never_split + (never_split if never_split is not None else [])
-        tokens = []
-
-        for word in self.sudachi.tokenize(text):
-            token = word.surface()
-
-            if self.do_lower_case and token not in never_split:
-                token = token.lower()
-
-            if self.trim_whitespace:
-                if token.strip() == "":
-                    continue
-                else:
-                    token = token.strip()
-
-            tokens.append(token)
-
-        return tokens
-
-
-class JumanppTokenizer:
-    """Runs basic tokenization with jumanpp morphological parser."""
-
-    def __init__(
-        self,
-        do_lower_case=False,
-        never_split=None,
-        normalize_text=True,
-        trim_whitespace=False,
-    ):
-        """
-        Constructs a JumanppTokenizer.
-
-        Args:
-            **do_lower_case**: (*optional*) boolean (default True)
-                Whether to lowercase the input.
-            **never_split**: (*optional*) list of str
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
-            **normalize_text**: (*optional*) boolean (default True)
-                Whether to apply unicode normalization to text before tokenization.
-            **trim_whitespace**: (*optional*) boolean (default False)
-                Whether to trim all whitespace, tab, newline from tokens.
-        """
-
-        self.do_lower_case = do_lower_case
-        self.never_split = never_split if never_split is not None else []
-        self.normalize_text = normalize_text
-        self.trim_whitespace = trim_whitespace
-
-        try:
-            import rhoknp
-        except ImportError:
-            raise ImportError(
-                "You need to install rhoknp to use JumanppTokenizer. "
-                "See https://github.com/ku-nlp/rhoknp for installation."
-            )
-
-        self.juman = rhoknp.Jumanpp()
-
-    def tokenize(self, text, never_split=None, **kwargs):
-        """Tokenizes a piece of text."""
-        if self.normalize_text:
-            text = unicodedata.normalize("NFKC", text)
-
-        text = text.strip()
-
-        never_split = self.never_split + (never_split if never_split is not None else [])
-        tokens = []
-
-        for mrph in self.juman.apply_to_sentence(text).morphemes:
-            token = mrph.text
-
-            if self.do_lower_case and token not in never_split:
-                token = token.lower()
-
-            if self.trim_whitespace:
-                if token.strip() == "":
-                    continue
-                else:
-                    token = token.strip()
-
-            tokens.append(token)
-
-        return tokens
-
-
 class CharacterTokenizer:
     """Runs Character tokenization."""
 
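
The Sudachi and Jumanpp backends are removed outright, so callers that constructed these classes directly must migrate. The deleted segmentation logic is small enough to carry standalone; a sketch distilled from the removed SudachiTokenizer (assumes sudachipy and a Sudachi dictionary such as sudachidict_core are installed):

import unicodedata
from sudachipy import dictionary, tokenizer

# Same construction the deleted class performed, with its defaults inlined.
sudachi = dictionary.Dictionary(dict="core").create(tokenizer.Tokenizer.SplitMode.A)

def sudachi_tokenize(text):
    text = unicodedata.normalize("NFKC", text)  # NFKC, as in the deleted class
    return [word.surface() for word in sudachi.tokenize(text)]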
 
 