from os import PathLike
from typing import Dict, List, Optional, Union

from wenet.text.char_tokenizer import CharTokenizer
from wenet.text.tokenize_utils import tokenize_by_bpe_model


class BpeTokenizer(CharTokenizer):

    def __init__(
        self,
        bpe_model: Union[PathLike, str],
        symbol_table: Union[str, PathLike, Dict],
        non_lang_syms: Optional[Union[str, PathLike, List]] = None,
        split_with_space: bool = False,
        connect_symbol: str = '',
        unk: str = '<unk>',
    ) -> None:
        super().__init__(symbol_table, non_lang_syms, split_with_space,
                         connect_symbol, unk)
        self._model = bpe_model
        # NOTE(Mddct): multiprocessing.Process() issues
        #              don't build sp here
        self.bpe_model = None

    def _build_sp(self):
        # Lazily construct the SentencePiece processor on first use, so the
        # tokenizer object itself stays cheap to copy into worker processes.
        if self.bpe_model is None:
            import sentencepiece as spm
            self.bpe_model = spm.SentencePieceProcessor()
            self.bpe_model.load(self._model)

    def text2tokens(self, line: str) -> List[str]:
        self._build_sp()
        line = line.strip()
        if self.non_lang_syms_pattern is not None:
            # Split out non-linguistic symbols so each is kept as a single
            # token instead of being segmented by the BPE model.
            parts = self.non_lang_syms_pattern.split(line.upper())
            parts = [w for w in parts if len(w.strip()) > 0]
        else:
            parts = [line]

        tokens = []
        for part in parts:
            if part in self.non_lang_syms:
                tokens.append(part)
            else:
                tokens.extend(tokenize_by_bpe_model(self.bpe_model, part))
        return tokens

    def tokens2text(self, tokens: List[str]) -> str:
        self._build_sp()
        text = super().tokens2text(tokens)
        # SentencePiece marks word boundaries with '▁'; map it back to spaces.
        return text.replace("▁", ' ').strip()
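

# A minimal usage sketch, assuming a trained SentencePiece model and its
# matching symbol table on disk; 'bpe.model' and 'units.txt' are hypothetical
# paths, not files shipped with this module.
if __name__ == '__main__':
    tokenizer = BpeTokenizer(bpe_model='bpe.model', symbol_table='units.txt')
    # text2tokens() triggers _build_sp() lazily, so SentencePiece is only
    # imported and loaded in the process that actually tokenizes.
    tokens = tokenizer.text2tokens('hello world')
    print(tokens)  # e.g. ['▁hello', '▁world'], depending on the BPE model
    print(tokenizer.tokens2text(tokens))  # round-trips back to 'hello world'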