Spaces:

hon9kon9ize
/

tts

Running

App Files Files Community

indiejoseph committed on Jul 29, 2024

Commit

af3d42a

0 Parent(s):

first commit

Browse files

Files changed (27) hide show

.gitignore +6 -0
app.py +165 -0
bert/bert-large-cantonese/README.md +82 -0
bert/bert-large-cantonese/added_tokens.json +502 -0
bert/bert-large-cantonese/config.json +24 -0
bert/bert-large-cantonese/generation_config.json +5 -0
bert/bert-large-cantonese/special_tokens_map.json +37 -0
bert/bert-large-cantonese/tokenizer.json +0 -0
bert/bert-large-cantonese/tokenizer_config.json +4062 -0
bert/bert_models.json +15 -0
bert/deberta-v3-large/.gitattributes +27 -0
bert/deberta-v3-large/README.md +93 -0
bert/deberta-v3-large/config.json +22 -0
bert/deberta-v3-large/generator_config.json +22 -0
bert/deberta-v3-large/spm.model +3 -0
bert/deberta-v3-large/tokenizer_config.json +4 -0
infer.py +135 -0
requirements.txt +6 -0
text/__init__.py +30 -0
text/cantonese.py +273 -0
text/cantonese_bert.py +121 -0
text/cleaner.py +28 -0
text/english.py +494 -0
text/english_bert_mock.py +58 -0
text/jyutping.csv +0 -0
text/symbols.py +152 -0
text/yue_dict.txt +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,6 @@

+__pycache__
+pytorch_model.bin
+onnx/*
+.cache
+cmudict.rep
+cmudict_cache.pickle

app.py ADDED Viewed

	@@ -0,0 +1,165 @@

+from infer import OnnxInferenceSession
+from text import cleaned_text_to_sequence, get_bert
+from text.cleaner import clean_text
+import numpy as np
+from huggingface_hub import hf_hub_download
+import asyncio
+from pathlib import Path
+# Cached inference session; stays None until get_onnx_session() creates it.
+OnnxSession = None
+# Model assets consumed by download_models(). Each entry names the local
+# directory to populate ("local_path"), the Hugging Face Hub repository to
+# pull from ("repo_id"), and the repo-relative files to fetch ("files").
+models = [
+    {
+        "local_path": "./bert/bert-large-cantonese",
+        "repo_id": "hon9kon9ize/bert-large-cantonese",
+        "files": [
+            "pytorch_model.bin"
+        ]
+    },
+    {
+        "local_path": "./bert/deberta-v3-large",
+        "repo_id": "microsoft/deberta-v3-large",
+        "files": [
+            "spm.model",
+            "pytorch_model.bin"
+        ]
+    },
+    {
+        # The ONNX graphs listed here are the same paths that
+        # get_onnx_session() hands to OnnxInferenceSession.
+        "local_path": "./onnx",
+        "repo_id": "hon9kon9ize/bert-vits-zoengjyutgaai-onnx",
+        "files": [
+            "BertVits2.2PT.json",
+            "BertVits2.2PT/BertVits2.2PT_enc_p.onnx",
+            "BertVits2.2PT/BertVits2.2PT_emb.onnx",
+            "BertVits2.2PT/BertVits2.2PT_dp.onnx",
+            "BertVits2.2PT/BertVits2.2PT_sdp.onnx",
+            "BertVits2.2PT/BertVits2.2PT_flow.onnx",
+            "BertVits2.2PT/BertVits2.2PT_dec.onnx"
+        ]
+    }
+]
+def get_onnx_session():
+    global OnnxSession
+    if OnnxSession is not None:
+        return OnnxSession
+    OnnxSession = OnnxInferenceSession(
+        {
+            "enc": "onnx/BertVits2.2PT/BertVits2.2PT_enc_p.onnx",
+            "emb_g": "onnx/BertVits2.2PT/BertVits2.2PT_emb.onnx",
+            "dp": "onnx/BertVits2.2PT/BertVits2.2PT_dp.onnx",
+            "sdp": "onnx/BertVits2.2PT/BertVits2.2PT_sdp.onnx",
+            "flow": "onnx/BertVits2.2PT/BertVits2.2PT_flow.onnx",
+            "dec": "onnx/BertVits2.2PT/BertVits2.2PT_dec.onnx",
+        },
+        Providers=["CPUExecutionProvider"],
+    )
+    return OnnxSession
+def download_model_files(repo_id, files, local_path):
+    for file in files:
+        if not Path(local_path).joinpath(file).exists():
+            hf_hub_download(
+                repo_id, file, local_dir=local_path, local_dir_use_symlinks=False
+            )
+def download_models():
+    for data in models:
+        download_model_files(data["repo_id"], data["files"], data["local_path"])
+def intersperse(lst, item):
+    result = [item] * (len(lst) * 2 + 1)
+    result[1::2] = lst
+    return result
+def get_text(text, language_str, style_text=None, style_weight=0.7):
+    style_text = None if style_text == "" else style_text
+    # 在此处实现当前版本的get_text
+    norm_text, phone, tone, word2ph = clean_text(text, language_str)
+    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
+    # add blank
+    phone = intersperse(phone, 0)
+    tone = intersperse(tone, 0)
+    language = intersperse(language, 0)
+    for i in range(len(word2ph)):
+        word2ph[i] = word2ph[i] * 2
+    word2ph[0] += 1
+    bert_ori = get_bert(
+        norm_text, word2ph, language_str, "cpu", style_text, style_weight
+    )
+    del word2ph
+    assert bert_ori.shape[-1] == len(phone), phone
+    if language_str == "EN":
+        en_bert = bert_ori
+        yue_bert = np.random.randn(1024, len(phone))
+    elif language_str == "YUE":
+        en_bert = np.random.randn(1024, len(phone))
+        yue_bert = bert_ori
+    else:
+        raise ValueError("language_str should be EN or YUE")
+    assert yue_bert.shape[-1] == len(
+        phone
+    ), f"Bert seq len {yue_bert.shape[-1]} != {len(phone)}"
+    phone = np.asarray(phone)
+    tone = np.asarray(tone)
+    language = np.asarray(language)
+    en_bert = np.asarray(en_bert.T)
+    yue_bert = np.asarray(yue_bert.T)
+    return en_bert, yue_bert, phone, tone, language
+# Text-to-speech function
+async def text_to_speech(text, sid=0, language="YUE"):
+    Session = get_onnx_session()
+    if not text.strip():
+        return None, gr.Warning("Please enter text to convert.")
+    en_bert, yue_bert, x, tone, language = get_text(text, language)
+    sid = np.array([sid])
+    audio = Session(x, tone, language, en_bert, yue_bert, sid)
+    return audio[0][0]
+# Create Gradio application
+import gradio as gr
+# Gradio interface function
+def tts_interface(text):
+    audio = asyncio.run(text_to_speech(text, 0, "YUE"))
+    return 44100, audio
+async def create_demo():
+    description = """廣東話語音生成器，基於Bert-VITS2模型
+注意：model 本身支持廣東話同英文，但呢個 space 未實現中英夾雜生成。
+"""
+    demo = gr.Interface(
+        fn=tts_interface,
+        inputs=[
+            gr.Textbox(label="Input Text", lines=5),
+        ],
+        outputs=[
+            gr.Audio(label="Generated Audio"),
+        ],
+        title="Cantonese TTS Text-to-Speech",
+        description=description,
+        analytics_enabled=False,
+        allow_flagging=False
+    )
+    return demo
+# Run the application
+if __name__ == "__main__":
+    # Fetch any model files that are not on disk yet before the UI starts.
+    download_models()
+    # create_demo is a coroutine function, so it has to be run through
+    # asyncio.run to obtain the interface object.
+    demo = asyncio.run(create_demo())
+    demo.launch()

bert/bert-large-cantonese/README.md ADDED Viewed

	@@ -0,0 +1,82 @@

+---
+library_name: transformers
+language:
+  - yue
+license: cc-by-4.0
+tags:
+  - generated_from_trainer
+pipeline_tag: fill-mask
+widget:
+  - text: 香港原本[MASK]一個人煙稀少嘅漁港。
+    example_title: 係
+model-index:
+  - name: bert-large-cantonese
+    results: []
+---
+# bert-large-cantonese
+## Description
+This model is trained from scratch on Cantonese text. It is a BERT model with a large architecture (24-layer, 1024-hidden, 16-heads, 326M parameters).
+The first training stage pre-trains the model on sequences of length 128 with a batch size of 512 for 1 epoch. The second stage continues pre-training on sequences of length 512 with a batch size of 512 for one more epoch.
+## How to use
+You can use this model directly with a pipeline for masked language modeling:
+```python
+from transformers import pipeline
+mask_filler = pipeline(
+    "fill-mask",
+    model="hon9kon9ize/bert-large-cantonese"
+)
+mask_filler("雞蛋六隻，糖呢就兩茶匙，仲有[MASK]橙皮添。")
+# [{'score': 0.08160534501075745,
+#   'token': 943,
+#   'token_str': '個',
+#   'sequence': '雞 蛋 六 隻 ， 糖 呢 就 兩 茶 匙 ， 仲 有 個 橙 皮 添 。'},
+#  {'score': 0.06182105466723442,
+#   'token': 1576,
+#   'token_str': '啲',
+#   'sequence': '雞 蛋 六 隻 ， 糖 呢 就 兩 茶 匙 ， 仲 有 啲 橙 皮 添 。'},
+#  {'score': 0.04600336775183678,
+#   'token': 1646,
+#   'token_str': '嘅',
+#   'sequence': '雞 蛋 六 隻 ， 糖 呢 就 兩 茶 匙 ， 仲 有 嘅 橙 皮 添 。'},
+#  {'score': 0.03743772581219673,
+#   'token': 3581,
+#   'token_str': '橙',
+#   'sequence': '雞 蛋 六 隻 ， 糖 呢 就 兩 茶 匙 ， 仲 有 橙 橙 皮 添 。'},
+#  {'score': 0.031560592353343964,
+#   'token': 5148,
+#   'token_str': '紅',
+#   'sequence': '雞 蛋 六 隻 ， 糖 呢 就 兩 茶 匙 ， 仲 有 紅 橙 皮 添 。'}]
+```
+## Training hyperparameters
+The following hyperparameters were used during the first training stage:
+- Batch size: 512
+- Learning rate: 1e-4
+- Learning rate scheduler: linear decay
+- 1 Epoch
+- Warmup ratio: 0.1
+Loss plot on [Weights & Biases](https://api.wandb.ai/links/indiejoseph/v3ljlpmp)
+The following hyperparameters were used during the second training stage:
+- Batch size: 512
+- Learning rate: 5e-5
+- Learning rate scheduler: linear decay
+- 1 Epoch
+- Warmup ratio: 0.1
+Loss plot on [Weights & Biases](https://api.wandb.ai/links/indiejoseph/vcm3q1ef)

bert/bert-large-cantonese/added_tokens.json ADDED Viewed

	@@ -0,0 +1,502 @@

+{
+  "㔷": 21620,
+  "㖭": 21330,
+  "㚻": 21255,
+  "㞗": 21216,
+  "㞘": 21384,
+  "㦸": 21493,
+  "㨂": 21307,
+  "㩒": 21182,
+  "㴓": 21485,
+  "㷫": 21265,
+  "乸": 21143,
+  "仼": 21501,
+  "佮": 21234,
+  "侘": 21537,
+  "偲": 21220,
+  "僆": 21421,
+  "僞": 21471,
+  "僳": 21564,
+  "儁": 21422,
+  "儍": 21388,
+  "兗": 21368,
+  "冚": 21138,
+  "冧": 21137,
+  "凖": 21454,
+  "勷": 21522,
+  "卌": 21284,
+  "卽": 21186,
+  "厏": 21439,
+  "厓": 21449,
+  "厠": 21256,
+  "厹": 21285,
+  "吔": 21205,
+  "吲": 21403,
+  "吿": 21547,
+  "呑": 21331,
+  "呔": 21204,
+  "咃": 21533,
+  "咇": 21300,
+  "咼": 21565,
+  "哚": 21376,
+  "唂": 21402,
+  "唒": 21250,
+  "唓": 21401,
+  "唞": 21175,
+  "唥": 21144,
+  "唨": 21159,
+  "唪": 21146,
+  "唻": 21223,
+  "啋": 21428,
+  "啩": 21178,
+  "啹": 21482,
+  "喐": 21165,
+  "喥": 21316,
+  "喼": 21192,
+  "嗌": 21129,
+  "嗮": 21130,
+  "嗱": 21145,
+  "嘥": 21151,
+  "噃": 21197,
+  "噉": 21128,
+  "噏": 21170,
+  "噚": 21135,
+  "嚙": 21282,
+  "嚡": 21236,
+  "嚦": 21455,
+  "嚫": 21346,
+  "嚹": 21158,
+  "嚿": 21134,
+  "囇": 21612,
+  "囖": 21140,
+  "囘": 21504,
+  "坭": 21315,
+  "垻": 21538,
+  "埐": 21294,
+  "埞": 21180,
+  "埲": 21288,
+  "堊": 21309,
+  "塡": 21511,
+  "塱": 21187,
+  "塲": 21445,
+  "塹": 21481,
+  "奀": 21306,
+  "奭": 21492,
+  "妺": 21465,
+  "姵": 21536,
+  "娸": 21569,
+  "媺": 21431,
+  "嫗": 21311,
+  "嫰": 21323,
+  "嬋": 21400,
+  "嬲": 21131,
+  "孭": 21179,
+  "孲": 21210,
+  "孻": 21264,
+  "尐": 21157,
+  "尙": 21520,
+  "尢": 21619,
+  "屘": 21484,
+  "屙": 21160,
+  "岃": 21392,
+  "嶠": 21267,
+  "幗": 21269,
+  "幪": 21279,
+  "廡": 21530,
+  "廸": 21217,
+  "廻": 21479,
+  "彊": 21446,
+  "彖": 21335,
+  "徂": 21155,
+  "忟": 21301,
+  "惗": 21353,
+  "愃": 21527,
+  "愨": 21562,
+  "慇": 21603,
+  "慤": 21389,
+  "憓": 21477,
+  "戇": 21181,
+  "戙": 21281,
+  "戥": 21162,
+  "扤": 21541,
+  "扲": 21549,
+  "扺": 21293,
+  "抆": 21266,
+  "抌": 21258,
+  "抺": 21238,
+  "拃": 21188,
+  "拏": 21271,
+  "拕": 21476,
+  "挐": 21524,
+  "捽": 21212,
+  "掕": 21166,
+  "掗": 21486,
+  "掟": 21153,
+  "掹": 21214,
+  "揈": 21251,
+  "揞": 21429,
+  "揦": 21371,
+  "揼": 21184,
+  "揾": 21132,
+  "搣": 21222,
+  "搦": 21383,
+  "搲": 21317,
+  "搾": 21398,
+  "摑": 21268,
+  "摱": 21438,
+  "摷": 21209,
+  "撘": 21224,
+  "撣": 21615,
+  "撳": 21141,
+  "撾": 21183,
+  "擗": 21589,
+  "擧": 21521,
+  "擸": 21334,
+  "攆": 21544,
+  "攰": 21139,
+  "攷": 21270,
+  "旚": 21582,
+  "旯": 21280,
+  "旼": 21399,
+  "昃": 21483,
+  "昅": 21528,
+  "昪": 21377,
+  "昰": 21459,
+  "昺": 21380,
+  "暎": 21558,
+  "暪": 21437,
+  "曱": 21185,
+  "朏": 21557,
+  "朳": 21572,
+  "柙": 21551,
+  "栢": 21193,
+  "栱": 21581,
+  "梘": 21219,
+  "椏": 21385,
+  "椗": 21618,
+  "榘": 21560,
+  "榚": 21369,
+  "樋": 21601,
+  "樖": 21150,
+  "樨": 21475,
+  "樴": 21413,
+  "橛": 21156,
+  "檠": 21272,
+  "櫈": 21173,
+  "櫟": 21516,
+  "櫳": 21215,
+  "欏": 21500,
+  "殮": 21295,
+  "殻": 21207,
+  "氘": 21616,
+  "氚": 21574,
+  "氬": 21447,
+  "氼": 21329,
+  "沊": 21509,
+  "沔": 21552,
+  "沚": 21490,
+  "泂": 21461,
+  "涷": 21340,
+  "淥": 21235,
+  "淸": 21363,
+  "湉": 21443,
+  "湞": 21626,
+  "湴": 21407,
+  "滘": 21161,
+  "漖": 21627,
+  "潁": 21396,
+  "潯": 21241,
+  "澌": 21292,
+  "濰": 21394,
+  "濶": 21468,
+  "瀡": 21435,
+  "瀦": 21535,
+  "灃": 21625,
+  "灕": 21420,
+  "炆": 21172,
+  "炑": 21474,
+  "炘": 21621,
+  "烚": 21189,
+  "烴": 21229,
+  "焫": 21248,
+  "煇": 21227,
+  "煬": 21247,
+  "煱": 21347,
+  "燶": 21163,
+  "燾": 21386,
+  "牀": 21168,
+  "牘": 21600,
+  "猁": 21226,
+  "猢": 21609,
+  "猻": 21540,
+  "獌": 21198,
+  "獴": 21415,
+  "珓": 21370,
+  "琚": 21597,
+  "琤": 21393,
+  "琿": 21494,
+  "瑂": 21423,
+  "瑭": 21573,
+  "璘": 21555,
+  "璠": 21240,
+  "璣": 21299,
+  "璦": 21556,
+  "璩": 21508,
+  "瓘": 21554,
+  "瓚": 21318,
+  "甂": 21457,
+  "甑": 21539,
+  "甴": 21190,
+  "畧": 21322,
+  "畵": 21416,
+  "疎": 21460,
+  "疴": 21338,
+  "痲": 21576,
+  "痾": 21164,
+  "癆": 21503,
+  "癈": 21333,
+  "癗": 21433,
+  "癦": 21610,
+  "癩": 21410,
+  "睺": 21296,
+  "砬": 21568,
+  "砵": 21194,
+  "硃": 21570,
+  "硏": 21342,
+  "硤": 21201,
+  "礮": 21375,
+  "祆": 21472,
+  "祼": 21417,
+  "禕": 21542,
+  "禰": 21514,
+  "稈": 21367,
+  "穏": 21341,
+  "窰": 21230,
+  "竈": 21286,
+  "竉": 21289,
+  "竪": 21550,
+  "笪": 21147,
+  "筧": 21605,
+  "篋": 21359,
+  "簋": 21277,
+  "簒": 21418,
+  "粢": 21586,
+  "糉": 21263,
+  "糭": 21253,
+  "糴": 21425,
+  "紇": 21470,
+  "紥": 21252,
+  "綉": 21575,
+  "綟": 21260,
+  "綣": 21512,
+  "綷": 21441,
+  "緡": 21245,
+  "緲": 21517,
+  "縉": 21297,
+  "縹": 21587,
+  "繑": 21448,
+  "繙": 21246,
+  "缐": 21553,
+  "罅": 21191,
+  "罉": 21430,
+  "罟": 21324,
+  "羕": 21507,
+  "羶": 21378,
+  "翕": 21456,
+  "耖": 21390,
+  "肶": 21351,
+  "胐": 21332,
+  "脧": 21303,
+  "脷": 21148,
+  "腍": 21167,
+  "膥": 21228,
+  "膶": 21257,
+  "臏": 21566,
+  "舘": 21374,
+  "舢": 21563,
+  "舨": 21592,
+  "艶": 21593,
+  "苺": 21488,
+  "茘": 21624,
+  "菴": 21312,
+  "蒴": 21343,
+  "蓀": 21458,
+  "蔴": 21177,
+  "蕓": 21518,
+  "藪": 21302,
+  "蘄": 21613,
+  "蘅": 21478,
+  "蚧": 21579,
+  "蛺": 21391,
+  "蜑": 21358,
+  "蝻": 21259,
+  "螈": 21291,
+  "蟈": 21419,
+  "蟧": 21360,
+  "蟶": 21233,
+  "蠄": 21326,
+  "蠏": 21467,
+  "蠑": 21328,
+  "衊": 21469,
+  "裇": 21304,
+  "褦": 21221,
+  "褸": 21171,
+  "覈": 21290,
+  "覲": 21453,
+  "觜": 21496,
+  "訃": 21571,
+  "訌": 21412,
+  "訢": 21466,
+  "詏": 21244,
+  "詒": 21531,
+  "誒": 21152,
+  "謖": 21473,
+  "謚": 21237,
+  "謳": 21278,
+  "谿": 21580,
+  "豕": 21491,
+  "趷": 21424,
+  "跣": 21206,
+  "踎": 21202,
+  "踭": 21203,
+  "踼": 21404,
+  "躂": 21426,
+  "躄": 21195,
+  "躝": 21274,
+  "軚": 21196,
+  "軛": 21357,
+  "軫": 21349,
+  "軭": 21497,
+  "軻": 21434,
+  "輋": 21239,
+  "迾": 21325,
+  "逄": 21594,
+  "逑": 21611,
+  "逳": 21211,
+  "邴": 21310,
+  "郃": 21604,
+  "鄕": 21406,
+  "鄴": 21287,
+  "酎": 21546,
+  "釙": 21450,
+  "鈷": 21548,
+  "鈹": 21545,
+  "鉍": 21584,
+  "鉞": 21525,
+  "鉬": 21588,
+  "鉸": 21308,
+  "鉼": 21387,
+  "銥": 21462,
+  "銨": 21365,
+  "銫": 21440,
+  "銻": 21532,
+  "銼": 21432,
+  "鋇": 21614,
+  "鋯": 21590,
+  "錀": 21499,
+  "錒": 21498,
+  "錕": 21372,
+  "錡": 21199,
+  "鍔": 21515,
+  "鍚": 21273,
+  "鍠": 21261,
+  "鍬": 21379,
+  "鍶": 21344,
+  "鎅": 21339,
+  "鎘": 21529,
+  "鎢": 21362,
+  "鏇": 21397,
+  "鏐": 21585,
+  "鏝": 21218,
+  "鏵": 21444,
+  "鏹": 21502,
+  "鐡": 21505,
+  "鑌": 21452,
+  "鑭": 21567,
+  "閂": 21154,
+  "閆": 21463,
+  "閙": 21366,
+  "閤": 21381,
+  "閪": 21200,
+  "閭": 21409,
+  "闐": 21591,
+  "闓": 21442,
+  "靑": 21405,
+  "靭": 21262,
+  "靱": 21599,
+  "韃": 21242,
+  "韞": 21354,
+  "韮": 21254,
+  "頊": 21348,
+  "頴": 21327,
+  "顓": 21427,
+  "顥": 21411,
+  "顳": 21319,
+  "飮": 21561,
+  "餬": 21276,
+  "餸": 21133,
+  "饀": 21414,
+  "饉": 21534,
+  "馱": 21480,
+  "駖": 21356,
+  "駙": 21617,
+  "駟": 21489,
+  "駡": 21598,
+  "騫": 21314,
+  "騭": 21364,
+  "騮": 21174,
+  "騾": 21487,
+  "驃": 21336,
+  "驄": 21337,
+  "骹": 21355,
+  "髀": 21142,
+  "髹": 21350,
+  "鬅": 21213,
+  "鬈": 21408,
+  "鬩": 21543,
+  "鬭": 21495,
+  "鬲": 21320,
+  "魨": 21464,
+  "鮋": 21596,
+  "鮟": 21232,
+  "鮫": 21352,
+  "鯇": 21305,
+  "鯡": 21623,
+  "鯪": 21243,
+  "鯭": 21382,
+  "鰂": 21169,
+  "鰹": 21345,
+  "鱇": 21231,
+  "鱘": 21395,
+  "鱟": 21583,
+  "鱲": 21208,
+  "鳯": 21451,
+  "鴞": 21275,
+  "鴣": 21607,
+  "鴴": 21622,
+  "鵐": 21321,
+  "鵞": 21510,
+  "鵪": 21283,
+  "鶉": 21249,
+  "鶻": 21608,
+  "鶿": 21559,
+  "鷂": 21298,
+  "鷄": 21176,
+  "鷓": 21606,
+  "鷸": 21373,
+  "鸕": 21526,
+  "鸛": 21361,
+  "麪": 21149,
+  "麫": 21577,
+  "麿": 21225,
+  "黐": 21136,
+  "鼆": 21313,
+  "鼇": 21602,
+  "鼴": 21436,
+  "鼷": 21506,
+  "齲": 21519,
+  "齶": 21578,
+  "龑": 21523,
+  "龠": 21595,
+  "龢": 21513
+}

bert/bert-large-cantonese/config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "architectures": [
+    "BertForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.1",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 21628
+}

bert/bert-large-cantonese/generation_config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_from_model_config": true,
+  "pad_token_id": 0,
+  "transformers_version": "4.40.1"
+}

bert/bert-large-cantonese/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

bert/bert-large-cantonese/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

bert/bert-large-cantonese/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,4062 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "21128": {
+      "content": "噉",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21129": {
+      "content": "嗌",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21130": {
+      "content": "嗮",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21131": {
+      "content": "嬲",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21132": {
+      "content": "揾",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21133": {
+      "content": "餸",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21134": {
+      "content": "嚿",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21135": {
+      "content": "噚",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21136": {
+      "content": "黐",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21137": {
+      "content": "冧",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21138": {
+      "content": "冚",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21139": {
+      "content": "攰",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21140": {
+      "content": "囖",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21141": {
+      "content": "撳",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21142": {
+      "content": "髀",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21143": {
+      "content": "乸",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21144": {
+      "content": "唥",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21145": {
+      "content": "嗱",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21146": {
+      "content": "唪",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21147": {
+      "content": "笪",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21148": {
+      "content": "脷",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21149": {
+      "content": "麪",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21150": {
+      "content": "樖",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21151": {
+      "content": "嘥",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21152": {
+      "content": "誒",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21153": {
+      "content": "掟",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21154": {
+      "content": "閂",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21155": {
+      "content": "徂",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21156": {
+      "content": "橛",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21157": {
+      "content": "尐",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21158": {
+      "content": "嚹",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21159": {
+      "content": "唨",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21160": {
+      "content": "屙",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21161": {
+      "content": "滘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21162": {
+      "content": "戥",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21163": {
+      "content": "燶",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21164": {
+      "content": "痾",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21165": {
+      "content": "喐",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21166": {
+      "content": "掕",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21167": {
+      "content": "腍",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21168": {
+      "content": "牀",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21169": {
+      "content": "鰂",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21170": {
+      "content": "噏",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21171": {
+      "content": "褸",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21172": {
+      "content": "炆",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21173": {
+      "content": "櫈",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21174": {
+      "content": "騮",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21175": {
+      "content": "唞",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21176": {
+      "content": "鷄",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21177": {
+      "content": "蔴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21178": {
+      "content": "啩",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21179": {
+      "content": "孭",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21180": {
+      "content": "埞",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21181": {
+      "content": "戇",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21182": {
+      "content": "㩒",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21183": {
+      "content": "撾",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21184": {
+      "content": "揼",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21185": {
+      "content": "曱",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21186": {
+      "content": "卽",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21187": {
+      "content": "塱",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21188": {
+      "content": "拃",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21189": {
+      "content": "烚",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21190": {
+      "content": "甴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21191": {
+      "content": "罅",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21192": {
+      "content": "喼",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21193": {
+      "content": "栢",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21194": {
+      "content": "砵",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21195": {
+      "content": "躄",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21196": {
+      "content": "軚",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21197": {
+      "content": "噃",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21198": {
+      "content": "獌",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21199": {
+      "content": "錡",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21200": {
+      "content": "閪",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21201": {
+      "content": "硤",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21202": {
+      "content": "踎",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21203": {
+      "content": "踭",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21204": {
+      "content": "呔",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21205": {
+      "content": "吔",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21206": {
+      "content": "跣",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21207": {
+      "content": "殻",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21208": {
+      "content": "鱲",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21209": {
+      "content": "摷",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21210": {
+      "content": "孲",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21211": {
+      "content": "逳",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21212": {
+      "content": "捽",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21213": {
+      "content": "鬅",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21214": {
+      "content": "掹",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21215": {
+      "content": "櫳",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21216": {
+      "content": "㞗",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21217": {
+      "content": "廸",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21218": {
+      "content": "鏝",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21219": {
+      "content": "梘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21220": {
+      "content": "偲",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21221": {
+      "content": "褦",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21222": {
+      "content": "搣",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21223": {
+      "content": "唻",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21224": {
+      "content": "撘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21225": {
+      "content": "麿",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21226": {
+      "content": "猁",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21227": {
+      "content": "煇",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21228": {
+      "content": "膥",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21229": {
+      "content": "烴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21230": {
+      "content": "窰",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21231": {
+      "content": "鱇",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21232": {
+      "content": "鮟",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21233": {
+      "content": "蟶",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21234": {
+      "content": "佮",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21235": {
+      "content": "淥",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21236": {
+      "content": "嚡",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21237": {
+      "content": "謚",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21238": {
+      "content": "抺",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21239": {
+      "content": "輋",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21240": {
+      "content": "璠",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21241": {
+      "content": "潯",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21242": {
+      "content": "韃",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21243": {
+      "content": "鯪",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21244": {
+      "content": "詏",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21245": {
+      "content": "緡",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21246": {
+      "content": "繙",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21247": {
+      "content": "煬",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21248": {
+      "content": "焫",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21249": {
+      "content": "鶉",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21250": {
+      "content": "唒",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21251": {
+      "content": "揈",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21252": {
+      "content": "紥",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21253": {
+      "content": "糭",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21254": {
+      "content": "韮",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21255": {
+      "content": "㚻",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21256": {
+      "content": "厠",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21257": {
+      "content": "膶",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21258": {
+      "content": "抌",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21259": {
+      "content": "蝻",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21260": {
+      "content": "綟",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21261": {
+      "content": "鍠",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21262": {
+      "content": "靭",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21263": {
+      "content": "糉",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21264": {
+      "content": "孻",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21265": {
+      "content": "㷫",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21266": {
+      "content": "抆",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21267": {
+      "content": "嶠",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21268": {
+      "content": "摑",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21269": {
+      "content": "幗",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21270": {
+      "content": "攷",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21271": {
+      "content": "拏",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21272": {
+      "content": "檠",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21273": {
+      "content": "鍚",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21274": {
+      "content": "躝",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21275": {
+      "content": "鴞",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21276": {
+      "content": "餬",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21277": {
+      "content": "簋",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21278": {
+      "content": "謳",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21279": {
+      "content": "幪",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21280": {
+      "content": "旯",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21281": {
+      "content": "戙",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21282": {
+      "content": "嚙",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21283": {
+      "content": "鵪",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21284": {
+      "content": "卌",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21285": {
+      "content": "厹",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21286": {
+      "content": "竈",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21287": {
+      "content": "鄴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21288": {
+      "content": "埲",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21289": {
+      "content": "竉",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21290": {
+      "content": "覈",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21291": {
+      "content": "螈",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21292": {
+      "content": "澌",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21293": {
+      "content": "扺",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21294": {
+      "content": "埐",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21295": {
+      "content": "殮",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21296": {
+      "content": "睺",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21297": {
+      "content": "縉",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21298": {
+      "content": "鷂",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21299": {
+      "content": "璣",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21300": {
+      "content": "咇",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21301": {
+      "content": "忟",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21302": {
+      "content": "藪",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21303": {
+      "content": "脧",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21304": {
+      "content": "裇",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21305": {
+      "content": "鯇",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21306": {
+      "content": "奀",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21307": {
+      "content": "㨂",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21308": {
+      "content": "鉸",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21309": {
+      "content": "堊",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21310": {
+      "content": "邴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21311": {
+      "content": "嫗",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21312": {
+      "content": "菴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21313": {
+      "content": "鼆",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21314": {
+      "content": "騫",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21315": {
+      "content": "坭",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21316": {
+      "content": "喥",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21317": {
+      "content": "搲",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21318": {
+      "content": "瓚",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21319": {
+      "content": "顳",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21320": {
+      "content": "鬲",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21321": {
+      "content": "鵐",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21322": {
+      "content": "畧",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21323": {
+      "content": "嫰",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21324": {
+      "content": "罟",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21325": {
+      "content": "���",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21326": {
+      "content": "蠄",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21327": {
+      "content": "頴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21328": {
+      "content": "蠑",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21329": {
+      "content": "氼",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21330": {
+      "content": "㖭",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21331": {
+      "content": "呑",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21332": {
+      "content": "胐",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21333": {
+      "content": "癈",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21334": {
+      "content": "擸",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21335": {
+      "content": "彖",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21336": {
+      "content": "驃",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21337": {
+      "content": "驄",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21338": {
+      "content": "疴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21339": {
+      "content": "鎅",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21340": {
+      "content": "涷",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21341": {
+      "content": "穏",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21342": {
+      "content": "硏",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21343": {
+      "content": "蒴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21344": {
+      "content": "鍶",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21345": {
+      "content": "鰹",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21346": {
+      "content": "嚫",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21347": {
+      "content": "煱",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21348": {
+      "content": "頊",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21349": {
+      "content": "軫",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21350": {
+      "content": "髹",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21351": {
+      "content": "肶",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21352": {
+      "content": "鮫",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21353": {
+      "content": "惗",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21354": {
+      "content": "韞",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21355": {
+      "content": "骹",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21356": {
+      "content": "駖",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21357": {
+      "content": "軛",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21358": {
+      "content": "蜑",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21359": {
+      "content": "篋",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21360": {
+      "content": "蟧",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21361": {
+      "content": "鸛",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21362": {
+      "content": "鎢",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21363": {
+      "content": "淸",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21364": {
+      "content": "騭",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21365": {
+      "content": "銨",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21366": {
+      "content": "閙",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21367": {
+      "content": "稈",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21368": {
+      "content": "兗",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21369": {
+      "content": "榚",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21370": {
+      "content": "珓",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21371": {
+      "content": "揦",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21372": {
+      "content": "錕",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21373": {
+      "content": "鷸",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21374": {
+      "content": "舘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21375": {
+      "content": "礮",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21376": {
+      "content": "哚",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21377": {
+      "content": "昪",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21378": {
+      "content": "羶",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21379": {
+      "content": "鍬",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21380": {
+      "content": "昺",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21381": {
+      "content": "閤",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21382": {
+      "content": "鯭",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21383": {
+      "content": "搦",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21384": {
+      "content": "㞘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21385": {
+      "content": "椏",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21386": {
+      "content": "燾",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21387": {
+      "content": "鉼",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21388": {
+      "content": "儍",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21389": {
+      "content": "慤",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21390": {
+      "content": "耖",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21391": {
+      "content": "蛺",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21392": {
+      "content": "岃",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21393": {
+      "content": "琤",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21394": {
+      "content": "濰",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21395": {
+      "content": "鱘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21396": {
+      "content": "潁",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21397": {
+      "content": "鏇",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21398": {
+      "content": "搾",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21399": {
+      "content": "旼",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21400": {
+      "content": "嬋",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21401": {
+      "content": "唓",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21402": {
+      "content": "唂",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21403": {
+      "content": "吲",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21404": {
+      "content": "踼",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21405": {
+      "content": "靑",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21406": {
+      "content": "鄕",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21407": {
+      "content": "湴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21408": {
+      "content": "鬈",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21409": {
+      "content": "閭",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21410": {
+      "content": "癩",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21411": {
+      "content": "顥",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21412": {
+      "content": "訌",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21413": {
+      "content": "樴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21414": {
+      "content": "饀",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21415": {
+      "content": "獴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21416": {
+      "content": "畵",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21417": {
+      "content": "祼",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21418": {
+      "content": "簒",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21419": {
+      "content": "蟈",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21420": {
+      "content": "灕",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21421": {
+      "content": "僆",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21422": {
+      "content": "儁",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21423": {
+      "content": "瑂",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21424": {
+      "content": "趷",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21425": {
+      "content": "糴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21426": {
+      "content": "躂",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21427": {
+      "content": "顓",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21428": {
+      "content": "啋",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21429": {
+      "content": "揞",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21430": {
+      "content": "罉",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21431": {
+      "content": "媺",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21432": {
+      "content": "銼",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21433": {
+      "content": "癗",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21434": {
+      "content": "軻",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21435": {
+      "content": "瀡",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21436": {
+      "content": "鼴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21437": {
+      "content": "暪",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21438": {
+      "content": "摱",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21439": {
+      "content": "厏",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21440": {
+      "content": "銫",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21441": {
+      "content": "綷",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21442": {
+      "content": "闓",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21443": {
+      "content": "湉",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21444": {
+      "content": "鏵",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21445": {
+      "content": "塲",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21446": {
+      "content": "彊",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21447": {
+      "content": "氬",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21448": {
+      "content": "繑",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21449": {
+      "content": "厓",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21450": {
+      "content": "釙",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21451": {
+      "content": "鳯",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21452": {
+      "content": "鑌",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21453": {
+      "content": "覲",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21454": {
+      "content": "凖",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21455": {
+      "content": "嚦",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21456": {
+      "content": "翕",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21457": {
+      "content": "甂",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21458": {
+      "content": "蓀",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21459": {
+      "content": "昰",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21460": {
+      "content": "疎",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21461": {
+      "content": "泂",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21462": {
+      "content": "銥",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21463": {
+      "content": "閆",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21464": {
+      "content": "魨",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21465": {
+      "content": "妺",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21466": {
+      "content": "訢",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21467": {
+      "content": "蠏",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21468": {
+      "content": "濶",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21469": {
+      "content": "衊",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21470": {
+      "content": "紇",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21471": {
+      "content": "僞",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21472": {
+      "content": "祆",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21473": {
+      "content": "謖",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21474": {
+      "content": "炑",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21475": {
+      "content": "樨",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21476": {
+      "content": "拕",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21477": {
+      "content": "憓",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21478": {
+      "content": "蘅",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21479": {
+      "content": "廻",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21480": {
+      "content": "馱",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21481": {
+      "content": "塹",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21482": {
+      "content": "啹",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21483": {
+      "content": "昃",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21484": {
+      "content": "屘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21485": {
+      "content": "㴓",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21486": {
+      "content": "掗",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21487": {
+      "content": "騾",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21488": {
+      "content": "苺",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21489": {
+      "content": "駟",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21490": {
+      "content": "沚",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21491": {
+      "content": "豕",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21492": {
+      "content": "奭",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21493": {
+      "content": "㦸",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21494": {
+      "content": "琿",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21495": {
+      "content": "鬭",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21496": {
+      "content": "觜",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21497": {
+      "content": "軭",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21498": {
+      "content": "錒",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21499": {
+      "content": "錀",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21500": {
+      "content": "欏",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21501": {
+      "content": "仼",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21502": {
+      "content": "鏹",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21503": {
+      "content": "癆",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21504": {
+      "content": "囘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21505": {
+      "content": "鐡",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21506": {
+      "content": "鼷",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21507": {
+      "content": "羕",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21508": {
+      "content": "璩",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21509": {
+      "content": "沊",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21510": {
+      "content": "鵞",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21511": {
+      "content": "塡",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21512": {
+      "content": "綣",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21513": {
+      "content": "龢",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21514": {
+      "content": "禰",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21515": {
+      "content": "鍔",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21516": {
+      "content": "櫟",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21517": {
+      "content": "緲",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21518": {
+      "content": "蕓",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21519": {
+      "content": "齲",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21520": {
+      "content": "尙",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21521": {
+      "content": "擧",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21522": {
+      "content": "勷",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21523": {
+      "content": "龑",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21524": {
+      "content": "挐",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21525": {
+      "content": "鉞",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21526": {
+      "content": "鸕",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21527": {
+      "content": "愃",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21528": {
+      "content": "昅",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21529": {
+      "content": "鎘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21530": {
+      "content": "廡",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21531": {
+      "content": "詒",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21532": {
+      "content": "銻",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21533": {
+      "content": "咃",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21534": {
+      "content": "饉",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21535": {
+      "content": "瀦",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21536": {
+      "content": "姵",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21537": {
+      "content": "侘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21538": {
+      "content": "垻",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21539": {
+      "content": "甑",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21540": {
+      "content": "猻",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21541": {
+      "content": "扤",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21542": {
+      "content": "禕",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21543": {
+      "content": "鬩",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21544": {
+      "content": "攆",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21545": {
+      "content": "鈹",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21546": {
+      "content": "酎",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21547": {
+      "content": "吿",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21548": {
+      "content": "鈷",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21549": {
+      "content": "扲",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21550": {
+      "content": "竪",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21551": {
+      "content": "柙",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21552": {
+      "content": "沔",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21553": {
+      "content": "缐",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21554": {
+      "content": "瓘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21555": {
+      "content": "璘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21556": {
+      "content": "璦",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21557": {
+      "content": "朏",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21558": {
+      "content": "暎",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21559": {
+      "content": "鶿",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21560": {
+      "content": "榘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21561": {
+      "content": "飮",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21562": {
+      "content": "愨",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21563": {
+      "content": "舢",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21564": {
+      "content": "僳",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21565": {
+      "content": "咼",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21566": {
+      "content": "臏",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21567": {
+      "content": "鑭",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21568": {
+      "content": "砬",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21569": {
+      "content": "娸",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21570": {
+      "content": "硃",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21571": {
+      "content": "訃",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21572": {
+      "content": "朳",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21573": {
+      "content": "瑭",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21574": {
+      "content": "氚",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21575": {
+      "content": "綉",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21576": {
+      "content": "痲",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21577": {
+      "content": "麫",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21578": {
+      "content": "齶",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21579": {
+      "content": "蚧",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21580": {
+      "content": "谿",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21581": {
+      "content": "栱",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21582": {
+      "content": "旚",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21583": {
+      "content": "鱟",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21584": {
+      "content": "鉍",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21585": {
+      "content": "鏐",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21586": {
+      "content": "粢",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21587": {
+      "content": "縹",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21588": {
+      "content": "鉬",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21589": {
+      "content": "擗",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21590": {
+      "content": "鋯",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21591": {
+      "content": "闐",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21592": {
+      "content": "舨",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21593": {
+      "content": "艶",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21594": {
+      "content": "逄",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21595": {
+      "content": "龠",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21596": {
+      "content": "鮋",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21597": {
+      "content": "琚",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21598": {
+      "content": "駡",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21599": {
+      "content": "靱",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21600": {
+      "content": "牘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21601": {
+      "content": "樋",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21602": {
+      "content": "鼇",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21603": {
+      "content": "慇",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21604": {
+      "content": "郃",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21605": {
+      "content": "筧",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21606": {
+      "content": "鷓",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21607": {
+      "content": "鴣",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21608": {
+      "content": "鶻",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21609": {
+      "content": "猢",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21610": {
+      "content": "癦",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21611": {
+      "content": "逑",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21612": {
+      "content": "囇",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21613": {
+      "content": "蘄",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21614": {
+      "content": "鋇",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21615": {
+      "content": "撣",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21616": {
+      "content": "氘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21617": {
+      "content": "駙",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21618": {
+      "content": "椗",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21619": {
+      "content": "尢",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21620": {
+      "content": "㔷",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21621": {
+      "content": "炘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21622": {
+      "content": "鴴",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21623": {
+      "content": "鯡",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21624": {
+      "content": "茘",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21625": {
+      "content": "灃",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21626": {
+      "content": "湞",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "21627": {
+      "content": "漖",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "mask_token": "[MASK]",
+  "max_length": 512,
+  "model_max_length": 512,
+  "pad_to_multiple_of": null,
+  "pad_token": "[PAD]",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "sep_token": "[SEP]",
+  "stride": 0,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}

bert/bert_models.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "bert-large-cantonese": {
+        "repo_id": "hon9kon9ize/bert-large-cantonese",
+        "files": [
+            "pytorch_model.bin"
+        ]
+    },
+    "deberta-v3-large": {
+        "repo_id": "microsoft/deberta-v3-large",
+        "files": [
+            "spm.model",
+            "pytorch_model.bin"
+        ]
+    }
+}

bert/deberta-v3-large/.gitattributes ADDED Viewed

	@@ -0,0 +1,27 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

bert/deberta-v3-large/README.md ADDED Viewed

	@@ -0,0 +1,93 @@

+---
+language: en
+tags:
+  - deberta
+  - deberta-v3
+  - fill-mask
+thumbnail: https://huggingface.co/front/thumbnails/microsoft.png
+license: mit
+---
+## DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing
+[DeBERTa](https://arxiv.org/abs/2006.03654) improves the BERT and RoBERTa models using disentangled attention and an enhanced mask decoder. With those two improvements, DeBERTa outperforms RoBERTa on a majority of NLU tasks with 80GB of training data.
+In [DeBERTa V3](https://arxiv.org/abs/2111.09543), we further improved the efficiency of DeBERTa using ELECTRA-Style pre-training with Gradient-Disentangled Embedding Sharing. Compared to DeBERTa, our V3 version significantly improves the model performance on downstream tasks. You can find more technical details about the new model in our [paper](https://arxiv.org/abs/2111.09543).
+Please check the [official repository](https://github.com/microsoft/DeBERTa) for more implementation details and updates.
+The DeBERTa V3 large model comes with 24 layers and a hidden size of 1024. It has 304M backbone parameters with a vocabulary containing 128K tokens, which introduces 131M parameters in the embedding layer. This model was trained using the same 160GB of data as DeBERTa V2.
+#### Fine-tuning on NLU tasks
+We present the dev results on SQuAD 2.0 and MNLI tasks.
+| Model             |Vocabulary(K)|Backbone #Params(M)| SQuAD 2.0(F1/EM) | MNLI-m/mm(ACC)|
+|-------------------|----------|-------------------|-----------|----------|
+| RoBERTa-large     |50     |304                | 89.4/86.5 | 90.2   |
+| XLNet-large       |32     |-                  | 90.6/87.9 | 90.8   |
+| DeBERTa-large     |50     |-                  | 90.7/88.0 | 91.3   |
+| **DeBERTa-v3-large**|128|304                  |  **91.5/89.0**| **91.8/91.9**|
+#### Fine-tuning with HF transformers
+```bash
+#!/bin/bash
+cd transformers/examples/pytorch/text-classification/
+pip install datasets
+export TASK_NAME=mnli
+output_dir="ds_results"
+num_gpus=8
+batch_size=8
+python -m torch.distributed.launch --nproc_per_node=${num_gpus} \
+  run_glue.py \
+  --model_name_or_path microsoft/deberta-v3-large \
+  --task_name $TASK_NAME \
+  --do_train \
+  --do_eval \
+  --evaluation_strategy steps \
+  --max_seq_length 256 \
+  --warmup_steps 50 \
+  --per_device_train_batch_size ${batch_size} \
+  --learning_rate 6e-6 \
+  --num_train_epochs 2 \
+  --output_dir $output_dir \
+  --overwrite_output_dir \
+  --logging_steps 1000 \
+  --logging_dir $output_dir
+```
+### Citation
+If you find DeBERTa useful for your work, please cite the following papers:
+``` latex
+@misc{he2021debertav3,
+      title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing},
+      author={Pengcheng He and Jianfeng Gao and Weizhu Chen},
+      year={2021},
+      eprint={2111.09543},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
+``` latex
+@inproceedings{
+he2021deberta,
+title={DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION},
+author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen},
+booktitle={International Conference on Learning Representations},
+year={2021},
+url={https://openreview.net/forum?id=XPZIaotutsD}
+}
+```

bert/deberta-v3-large/config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+		"model_type": "deberta-v2",
+		"attention_probs_dropout_prob": 0.1,
+		"hidden_act": "gelu",
+		"hidden_dropout_prob": 0.1,
+		"hidden_size": 1024,
+		"initializer_range": 0.02,
+		"intermediate_size": 4096,
+		"max_position_embeddings": 512,
+		"relative_attention": true,
+		"position_buckets": 256,
+		"norm_rel_ebd": "layer_norm",
+		"share_att_key": true,
+		"pos_att_type": "p2c|c2p",
+		"layer_norm_eps": 1e-7,
+		"max_relative_positions": -1,
+		"position_biased_input": false,
+		"num_attention_heads": 16,
+		"num_hidden_layers": 24,
+		"type_vocab_size": 0,
+		"vocab_size": 128100
+}

bert/deberta-v3-large/generator_config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+	"model_type": "deberta-v2",
+	"attention_probs_dropout_prob": 0.1,
+	"hidden_act": "gelu",
+	"hidden_dropout_prob": 0.1,
+	"hidden_size": 1024,
+	"initializer_range": 0.02,
+	"intermediate_size": 4096,
+	"max_position_embeddings": 512,
+	"relative_attention": true,
+	"position_buckets": 256,
+	"norm_rel_ebd": "layer_norm",
+	"share_att_key": true,
+	"pos_att_type": "p2c|c2p",
+	"layer_norm_eps": 1e-7,
+	"max_relative_positions": -1,
+	"position_biased_input": false,
+	"num_attention_heads": 16,
+	"num_hidden_layers": 12,
+	"type_vocab_size": 0,
+	"vocab_size": 128100
+}

bert/deberta-v3-large/spm.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+size 2464616

bert/deberta-v3-large/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "do_lower_case": false,
+  "vocab_type": "spm"
+}

infer.py ADDED Viewed

	@@ -0,0 +1,135 @@

+import numpy as np
+import onnxruntime as ort
+from text import cantonese, english, cleaned_text_to_sequence
+language_module_map = {"EN": english, "YUE": cantonese}
+def clean_text(text, language):
+    language_module = language_module_map[language]
+    norm_text = language_module.text_normalize(text)
+    phones, tones, word2ph = language_module.g2p(norm_text)
+    return norm_text, phones, tones, word2ph
+def convert_pad_shape(pad_shape):
+    layer = pad_shape[::-1]
+    pad_shape = [item for sublist in layer for item in sublist]
+    return pad_shape
+def sequence_mask(length, max_length=None):
+    if max_length is None:
+        max_length = length.max()
+    x = np.arange(max_length, dtype=length.dtype)
+    return np.expand_dims(x, 0) < np.expand_dims(length, 1)
+def generate_path(duration, mask):
+    """Turn per-position durations into a boolean alignment path.
+    duration: [b, 1, t_x]
+    mask: [b, 1, t_y, t_x] (only its shape is read here)
+    Returns a boolean array of shape [b, 1, t_y, t_x]. For non-negative
+    durations, entry [b, 0, y, x] is True when y falls between the cumulative
+    duration up to x-1 (inclusive) and the cumulative duration up to x
+    (exclusive).
+    """
+    b, _, t_y, t_x = mask.shape
+    # Running end position of every x along the last axis.
+    cum_duration = np.cumsum(duration, -1)
+    cum_duration_flat = cum_duration.reshape(b * t_x)
+    # path[i, y] is True while y < cumulative end of flattened position i.
+    path = sequence_mask(cum_duration_flat, t_y)
+    path = path.reshape(b, t_x, t_y)
+    # XOR with the mask of the previous x (shifted by one along t_x) keeps only
+    # the y positions that belong to the current x.
+    path = path ^ np.pad(path, ((0, 0), (1, 0), (0, 0)))[:, :-1]
+    # [b, t_x, t_y] -> [b, 1, t_y, t_x]
+    path = np.expand_dims(path, 1).transpose(0, 1, 3, 2)
+    return path
+class OnnxInferenceSession:
+    def __init__(self, path, Providers=["CPUExecutionProvider"]):
+        self.enc = ort.InferenceSession(path["enc"], providers=Providers)
+        self.emb_g = ort.InferenceSession(path["emb_g"], providers=Providers)
+        self.dp = ort.InferenceSession(path["dp"], providers=Providers)
+        self.sdp = ort.InferenceSession(path["sdp"], providers=Providers)
+        self.flow = ort.InferenceSession(path["flow"], providers=Providers)
+        self.dec = ort.InferenceSession(path["dec"], providers=Providers)
+    def __call__(
+        self,
+        seq,
+        tone,
+        language,
+        bert_en,
+        bert_yue,
+        sid,
+        seed=114514,
+        seq_noise_scale=0.8,
+        sdp_noise_scale=0.6,
+        length_scale=1.0,
+        sdp_ratio=0.0,
+    ):
+        if seq.ndim == 1:
+            seq = np.expand_dims(seq, 0)
+        if tone.ndim == 1:
+            tone = np.expand_dims(tone, 0)
+        if language.ndim == 1:
+            language = np.expand_dims(language, 0)
+        assert (seq.ndim == 2, tone.ndim == 2, language.ndim == 2)
+        g = self.emb_g.run(
+            None,
+            {
+                "sid": sid.astype(np.int64),
+            },
+        )[0]
+        g = np.expand_dims(g, -1)
+        enc_rtn = self.enc.run(
+            None,
+            {
+                "x": seq.astype(np.int64),
+                "t": tone.astype(np.int64),
+                "language": language.astype(np.int64),
+                "bert_0": bert_en.astype(np.float32),
+                "bert_1": bert_yue.astype(np.float32),
+                "g": g.astype(np.float32),
+            },
+        )
+        x, m_p, logs_p, x_mask = enc_rtn[0], enc_rtn[1], enc_rtn[2], enc_rtn[3]
+        np.random.seed(seed)
+        zinput = np.random.randn(x.shape[0], 2, x.shape[2]) * sdp_noise_scale
+        logw = self.sdp.run(
+            None, {"x": x, "x_mask": x_mask,
+                   "zin": zinput.astype(np.float32), "g": g}
+        )[0] * (sdp_ratio) + self.dp.run(None, {"x": x, "x_mask": x_mask, "g": g})[
+            0
+        ] * (
+            1 - sdp_ratio
+        )
+        w = np.exp(logw) * x_mask * length_scale
+        w_ceil = np.ceil(w)
+        y_lengths = np.clip(np.sum(w_ceil, (1, 2)), a_min=1.0, a_max=100000).astype(
+            np.int64
+        )
+        y_mask = np.expand_dims(sequence_mask(y_lengths, None), 1)
+        attn_mask = np.expand_dims(x_mask, 2) * np.expand_dims(y_mask, -1)
+        attn = generate_path(w_ceil, attn_mask)
+        m_p = np.matmul(attn.squeeze(1), m_p.transpose(0, 2, 1)).transpose(
+            0, 2, 1
+        )  # [b, t', t], [b, t, d] -> [b, d, t']
+        logs_p = np.matmul(attn.squeeze(1), logs_p.transpose(0, 2, 1)).transpose(
+            0, 2, 1
+        )  # [b, t', t], [b, t, d] -> [b, d, t']
+        z_p = (
+            m_p
+            + np.random.randn(m_p.shape[0], m_p.shape[1], m_p.shape[2])
+            * np.exp(logs_p)
+            * seq_noise_scale
+        )
+        z = self.flow.run(
+            None,
+            {
+                "z_p": z_p.astype(np.float32),
+                "y_mask": y_mask.astype(np.float32),
+                "g": g,
+            },
+        )[0]
+        return self.dec.run(None, {"z_in": z.astype(np.float32), "g": g})[0]

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio==4.39.0
+pycantonese==3.4.0
+cn2an==0.5.22
+jieba==0.42.1
+transformers[onnx]==4.42.4
+torch==2.3.1

text/__init__.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from text.symbols import *
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+def cleaned_text_to_sequence(cleaned_text, tones, language):
+    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+    Args:
+      text: string to convert to a sequence
+    Returns:
+      List of integers corresponding to the symbols in the text
+    """
+    phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
+    tone_start = language_tone_start_map[language]
+    tones = [i + tone_start for i in tones]
+    lang_id = language_id_map[language]
+    lang_ids = [lang_id for i in phones]
+    return phones, tones, lang_ids
+def get_bert(norm_text, word2ph, language, device, style_text=None, style_weight=0.7):
+    from .english_bert_mock import get_bert_feature as en_bert
+    from .cantonese_bert import get_bert_feature as yue_bert
+    lang_bert_func_map = {"EN": en_bert, "YUE": yue_bert}
+    bert = lang_bert_func_map[language](
+        norm_text, word2ph, device, style_text, style_weight
+    )
+    return bert

text/cantonese.py ADDED Viewed

	@@ -0,0 +1,273 @@

+from text.symbols import punctuation
+import re
+import unicodedata
+import cn2an
+import pycantonese
+import jieba
+import csv
+jieba.load_userdict("./text/yue_dict.txt")
+jyutping_dict = {}
+with open("./text/jyutping.csv", "r", encoding="utf-8") as f:
+    for line in f:
+        line = line.strip()
+        if not line:
+            continue
+        word, jyutping = line.split(",")
+        if word not in jyutping_dict:
+            jyutping_dict[word] = [jyutping]
+        else:
+            jyutping_dict[word].append(jyutping)
+def normalizer(x):
+    x = cn2an.transform(x, "an2cn")
+    return x
+def word2jyutping(word):
+    jyutpings = [pycantonese.characters_to_jyutping(
+        w)[0][1] for w in word if unicodedata.name(w, "").startswith("CJK UNIFIED IDEOGRAPH")]
+    for i, j in enumerate(jyutpings):
+        if re.search(r"^(la|ga)[1-6]$", j):
+            # la1 -> laa1, ga1 -> gaa1
+            jyutpings[i] = jyutpings[i].replace('a', 'aa')
+    if None in jyutpings:
+        raise ValueError(f"Failed to convert {word} to jyutping: {jyutpings}")
+    return " ".join(jyutpings)
+# Initials accepted by parse_jyutping; "" is used for syllables without one.
+INITIALS = ["", "b", "c", "d", "f", "g", "gw", "h", "j",
+            "k", "kw", "l", "m", "n", "ng", "p", "s", "t", "w", "z"]
+# Finals accepted by parse_jyutping; any other final raises ValueError there.
+FINALS = ["aa", "aai", "aau", "aam", "aan", "aang", "aap", "aat", "aak", "ai", "au", "am", "an", "ang", "ap", "at", "ak", "e", "ei", "eu", "em", "eng", "ep", "ek", "i", "iu", "im",
+          "in", "ing", "ip", "it", "ik", "o", "oi", "ou", "on", "ong", "ot", "ok", "oe", "oeng", "oek", "eoi", "eon", "eot", "u", "ui", "un", "ung", "ut", "uk", "yu", "yun", "yut", "m", "ng"]
+rep_map = {
+    "：": ",",
+    "︰": ",",
+    "；": ",",
+    "，": ",",
+    "﹐": ",",
+    "。": ".",
+    "！": "!",
+    "？": "?",
+    "﹖": "?",
+    "﹗": "!",
+    "\n": ".",
+    "·": ",",
+    "、": ",",
+    "丶": ",",
+    "...": "…",
+    "⋯": "…",
+    "$": ".",
+    "“": "'",
+    "”": "'",
+    '"': "'",
+    "‘": "'",
+    "’": "'",
+    "（": "'",
+    "）": "'",
+    "(": "'",
+    ")": "'",
+    "《": "'",
+    "》": "'",
+    "【": "'",
+    "】": "'",
+    "[": "'",
+    "]": "'",
+    "—": "-",
+    "～": "-",
+    "~": "-",
+    "「": "'",
+    "」": "'",
+    "_": "-",
+}
+replacement_chars = {
+    "\n": " ",
+    'ㄧ': '一',
+    '—': '一',
+    '更': '更',
+    '不': '不',
+    '料': '料',
+    '聯': '聯',
+    '行': '行',
+    '利': '利',
+    '謢': '護',
+    '岀': '出',
+    '鎭': '鎮',
+    '戯': '戲',
+    '旣': '既',
+    '立': '立',
+    '來': '來',
+    '年': '年',
+    '㗇': '蝦',
+}
+def replace_punctuation(text):
+    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
+    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
+    replaced_text = "".join(
+        c for c in replaced_text if unicodedata.name(c, "").startswith("CJK UNIFIED IDEOGRAPH") or c in punctuation
+    )
+    return replaced_text
+def replace_chars(text):
+    for k, v in replacement_chars.items():
+        text = text.replace(k, v)
+    return text
+def word_segmentation(text):
+    words = jieba.cut(text)
+    return words
+def text_normalize(text):
+    text = text.strip()
+    text = normalizer(text)
+    text = replace_punctuation(text)
+    text = replace_chars(text)
+    return text
+def jyuping_to_initials_finals_tones(jyuping_syllables):
+    initials_finals = []
+    tones = []
+    word2ph = []
+    for syllable in jyuping_syllables:
+        if syllable in punctuation:
+            initials_finals.append(syllable)
+            tones.append(0)
+            word2ph.append(1)  # Add 1 for punctuation
+        else:
+            init, final, tone = parse_jyutping(syllable)
+            initials_finals.extend([init, final])
+            tones.extend([tone, tone])
+            word2ph.append(2)
+    assert len(initials_finals) == len(tones)
+    return initials_finals, tones, word2ph
+wordshk_juytping = {}
+# with open("/notebooks/bert-vits2/Bert-VITS2-Cantonese/wordshk_juytping.csv", "r") as csv_file:
+#     csv_reader = csv.reader(csv_file, delimiter=',')
+#     for row in csv_reader:
+#         wordshk_juytping[text_normalize(row[0])] = row[1]
+def get_jyutping(text):
+    if text in wordshk_juytping:
+        return wordshk_juytping[text].split(" ")
+    words = word_segmentation(text)
+    jyutping_array = []
+    for word in words:
+        if word in punctuation:
+            jyutping_array.append(word)
+        else:
+            jyutpings = ""
+            if word in jyutping_dict:
+                jyutpings = jyutping_dict[word][0]
+            else:
+                jyutpings = word2jyutping(word)
+            if 'la1' in jyutpings:
+                print(text, words, jyutpings)
+            # match multple jyutping eg: liu4 ge3, or single jyutping eg: liu4
+            if not re.search(r"^([a-z]+[1-6]+[ ]?)+$", jyutpings):
+                raise ValueError(
+                    f"Failed to convert {word} to jyutping: {jyutpings}")
+            jyutping_array.extend(jyutpings.split(" "))
+    return jyutping_array
+def get_bert_feature(text, word2ph):
+    from text import cantonese_bert
+    return cantonese_bert.get_bert_feature(text, word2ph)
+def parse_jyutping(jyutping):
+    orig_jyutping = jyutping
+    if len(jyutping) < 2:
+        raise ValueError(f"Jyutping string too short: {jyutping}")
+    init = ""
+    if jyutping[0] == 'n' and jyutping[1] == 'g' and len(jyutping) == 3:
+        init = ""
+    elif jyutping[0] == 'm' and len(jyutping) == 2:
+        init = ""
+    elif jyutping[0] == 'n' and jyutping[1] == 'g':
+        init = 'ng'
+        jyutping = jyutping[2:]
+    elif jyutping[0] == 'g' and jyutping[1] == 'w':
+        init = 'gw'
+        jyutping = jyutping[2:]
+    elif jyutping[0] == 'k' and jyutping[1] == 'w':
+        init = 'kw'
+        jyutping = jyutping[2:]
+    elif jyutping[0] in 'bpmfdtnlgkhwzcsj':
+        init = jyutping[0]
+        jyutping = jyutping[1:]
+    else:
+        jyutping = jyutping
+    try:
+        tone = int(jyutping[-1])
+        jyutping = jyutping[:-1]
+    except:
+        raise ValueError(
+            f"Jyutping string does not end with a tone number, in {orig_jyutping}")
+    final = jyutping
+    assert init in INITIALS, f"Invalid initial: {init}, in {orig_jyutping}"
+    if final not in FINALS:
+        raise ValueError(f"Invalid final: {final}, in {orig_jyutping}")
+    return [init, final, tone]
+def g2p(text):
+    word2ph = []
+    jyuping = get_jyutping(text)
+    phones, tones, word2ph = jyuping_to_initials_finals_tones(jyuping)
+    phones = ["_"] + phones + ["_"]
+    tones = [0] + tones + [0]
+    word2ph = [1] + word2ph + [1]
+    return phones, tones, word2ph
+# Manual smoke test: normalize one sentence, run g2p and print the BERT
+# feature shape.
+if __name__ == "__main__":
+    # Rebinds get_bert_feature to the cantonese_bert implementation.
+    from text.cantonese_bert import get_bert_feature
+    # Alternative sample sentences are kept commented out below.
+    # text = "Apple BB 你點解會咁柒㗎？我真係唔該晒你呀！123"
+    text = "佢邊係想辭工吖，跳下草裙舞想加人工之嘛。"
+    # text = "我個 app 嘅介紹文想由你寫，因為我唔知從一般用家角度要細緻到乜程度"
+    # text = "佢哋最叻咪就係去㗇人傷害人,得個殼咋!"
+    text = text_normalize(text)
+    print('normalized text', text)
+    phones, tones, word2ph = g2p(text)
+    print(phones, tones, word2ph)
+    bert = get_bert_feature(text, word2ph)
+    print(bert.shape)

text/cantonese_bert.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import sys
+import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+# Directory the tokenizer and the masked-LM model are loaded from.
+LOCAL_PATH = "./bert/bert-large-cantonese"
+# Loaded once at import time and reused by every get_bert_feature call.
+tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
+# Cache of loaded models keyed by device, filled by get_bert_feature.
+models = dict()
+def get_bert_feature(
+    text,
+    word2ph,
+    device="cpu",
+    style_text=None,
+    style_weight=0.7,
+):
+    if (
+        sys.platform == "darwin"
+        and torch.backends.mps.is_available()
+        and device == "cpu"
+    ):
+        device = "mps"
+    if not device:
+        device = "cuda"
+    if device not in models.keys():
+        models[device] = AutoModelForMaskedLM.from_pretrained(
+            LOCAL_PATH).to(device)
+    with torch.no_grad():
+        inputs = tokenizer(text, return_tensors="pt")
+        for i in inputs:
+            inputs[i] = inputs[i].to(device)
+        res = models[device](**inputs, output_hidden_states=True)
+        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
+        if style_text:
+            style_inputs = tokenizer(style_text, return_tensors="pt")
+            for i in style_inputs:
+                style_inputs[i] = style_inputs[i].to(device)
+            style_res = models[device](
+                **style_inputs, output_hidden_states=True)
+            style_res = torch.cat(
+                style_res["hidden_states"][-3:-2], -1)[0].cpu()
+            style_res_mean = style_res.mean(0)
+    assert len(word2ph) == len(text) + \
+        2, f"{len(word2ph)} != {len(text) + 2}, {word2ph}, {text}"
+    word2phone = word2ph
+    phone_level_feature = []
+    for i in range(len(word2phone)):
+        if style_text:
+            repeat_feature = (
+                res[i].repeat(word2phone[i], 1) * (1 - style_weight)
+                + style_res_mean.repeat(word2phone[i], 1) * style_weight
+            )
+        else:
+            repeat_feature = res[i].repeat(word2phone[i], 1)
+        phone_level_feature.append(repeat_feature)
+    phone_level_feature = torch.cat(phone_level_feature, dim=0)
+    return phone_level_feature.T
+if __name__ == "__main__":
+    # Stand-alone demo of the word-to-phone feature expansion on random data.
+    word_level_feature = torch.rand(38, 1024)  # 38 tokens, each with a 1024-dim feature
+    word2phone = [
+        1,
+        2,
+        1,
+        2,
+        2,
+        1,
+        2,
+        2,
+        1,
+        2,
+        2,
+        1,
+        2,
+        2,
+        2,
+        2,
+        2,
+        1,
+        1,
+        2,
+        2,
+        1,
+        2,
+        2,
+        2,
+        2,
+        1,
+        2,
+        2,
+        2,
+        2,
+        2,
+        1,
+        2,
+        2,
+        2,
+        2,
+        1,
+    ]
+    # Total number of frames (sum of the repeat counts); not used below
+    total_frames = sum(word2phone)
+    print(word_level_feature.shape)
+    print(word2phone)
+    phone_level_feature = []
+    for i in range(len(word2phone)):
+        print(word_level_feature[i].shape)
+        # Repeat each token's feature word2phone[i] times
+        repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
+        phone_level_feature.append(repeat_feature)
+    phone_level_feature = torch.cat(phone_level_feature, dim=0)
+    print(phone_level_feature.shape)  # torch.Size([65, 1024])

text/cleaner.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from text import cantonese, english, cleaned_text_to_sequence
+# Maps a language code to the module whose text_normalize / g2p /
+# get_bert_feature functions are called for that language.
+language_module_map = {"EN": english, "YUE": cantonese}
+def clean_text(text, language):
+    language_module = language_module_map[language]
+    norm_text = language_module.text_normalize(text)
+    phones, tones, word2ph = language_module.g2p(norm_text)
+    return norm_text, phones, tones, word2ph
+def clean_text_bert(text, language):
+    language_module = language_module_map[language]
+    norm_text = language_module.text_normalize(text)
+    phones, tones, word2ph = language_module.g2p(norm_text)
+    bert = language_module.get_bert_feature(norm_text, word2ph)
+    return phones, tones, bert
+def text_to_sequence(text, language):
+    norm_text, phones, tones, word2ph = clean_text(text, language)
+    return cleaned_text_to_sequence(phones, tones, language)
+if __name__ == "__main__":
+    # Running this module directly does nothing.
+    pass

text/english.py ADDED Viewed

	@@ -0,0 +1,494 @@

+import pickle
+import os
+import re
+from g2p_en import G2p
+from transformers import DebertaV2Tokenizer
+from text import symbols
+from text.symbols import punctuation
+# Directory containing this module; the CMU dictionary and its pickle cache
+# are looked up next to it.
+current_file_path = os.path.dirname(__file__)
+CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
+CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
+# g2p_en converter; g2p() falls back to it for words missing from eng_dict.
+_g2p = G2p()
+# Relative path the DeBERTa tokenizer is loaded from (at import time).
+LOCAL_PATH = "./bert/deberta-v3-large"
+tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
+# Phone labels that g2p() passes through refine_ph when they appear in the
+# g2p_en output; anything else is kept as-is with tone 0.
+arpa = {
+    "AH0",
+    "S",
+    "AH1",
+    "EY2",
+    "AE2",
+    "EH0",
+    "OW2",
+    "UH0",
+    "NG",
+    "B",
+    "G",
+    "AY0",
+    "M",
+    "AA0",
+    "F",
+    "AO0",
+    "ER2",
+    "UH1",
+    "IY1",
+    "AH2",
+    "DH",
+    "IY0",
+    "EY1",
+    "IH0",
+    "K",
+    "N",
+    "W",
+    "IY2",
+    "T",
+    "AA1",
+    "ER1",
+    "EH2",
+    "OY0",
+    "UH2",
+    "UW1",
+    "Z",
+    "AW2",
+    "AW1",
+    "V",
+    "UW2",
+    "AA2",
+    "ER",
+    "AW0",
+    "UW0",
+    "R",
+    "OW1",
+    "EH1",
+    "ZH",
+    "AE0",
+    "IH2",
+    "IH",
+    "Y",
+    "JH",
+    "P",
+    "AY1",
+    "EY0",
+    "OY2",
+    "TH",
+    "HH",
+    "D",
+    "ER0",
+    "CH",
+    "AO1",
+    "AE1",
+    "AO2",
+    "OY1",
+    "AY2",
+    "IH1",
+    "OW0",
+    "L",
+    "SH",
+}
+def post_replace_ph(ph):
+    rep_map = {
+        "：": ",",
+        "；": ",",
+        "，": ",",
+        "。": ".",
+        "！": "!",
+        "？": "?",
+        "\n": ".",
+        "·": ",",
+        "、": ",",
+        "…": "...",
+        "···": "...",
+        "・・・": "...",
+        "v": "V",
+    }
+    if ph in rep_map.keys():
+        ph = rep_map[ph]
+    if ph in symbols:
+        return ph
+    if ph not in symbols:
+        ph = "UNK"
+    return ph
+# Punctuation normalization table used by replace_punctuation: full-width and
+# CJK marks become ASCII, quotes and brackets become "'", dashes and tildes
+# become "-". Insertion order matters because replace_punctuation builds its
+# regex alternation in this order (the three-character "···" / "・・・" keys
+# come before the single "·" / "・" keys).
+rep_map = {
+    "：": ",",
+    "；": ",",
+    "，": ",",
+    "。": ".",
+    "！": "!",
+    "？": "?",
+    "\n": ".",
+    "．": ".",
+    "…": "...",
+    "···": "...",
+    "・・・": "...",
+    "·": ",",
+    "・": ",",
+    "、": ",",
+    "$": ".",
+    "“": "'",
+    "”": "'",
+    '"': "'",
+    "‘": "'",
+    "’": "'",
+    "（": "'",
+    "）": "'",
+    "(": "'",
+    ")": "'",
+    "《": "'",
+    "》": "'",
+    "【": "'",
+    "】": "'",
+    "[": "'",
+    "]": "'",
+    "—": "-",
+    "−": "-",
+    "～": "-",
+    "~": "-",
+    "「": "'",
+    "」": "'",
+}
+def replace_punctuation(text):
+    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
+    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
+    # replaced_text = re.sub(
+    #     r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
+    #     + "".join(punctuation)
+    #     + r"]+",
+    #     "",
+    #     replaced_text,
+    # )
+    return replaced_text
+def read_dict():
+    g2p_dict = {}
+    start_line = 49
+    with open(CMU_DICT_PATH) as f:
+        line = f.readline()
+        line_index = 1
+        while line:
+            if line_index >= start_line:
+                line = line.strip()
+                word_split = line.split("  ")
+                word = word_split[0]
+                syllable_split = word_split[1].split(" - ")
+                g2p_dict[word] = []
+                for syllable in syllable_split:
+                    phone_split = syllable.split(" ")
+                    g2p_dict[word].append(phone_split)
+            line_index = line_index + 1
+            line = f.readline()
+    return g2p_dict
+def cache_dict(g2p_dict, file_path):
+    with open(file_path, "wb") as pickle_file:
+        pickle.dump(g2p_dict, pickle_file)
+def get_dict():
+    if os.path.exists(CACHE_PATH):
+        with open(CACHE_PATH, "rb") as pickle_file:
+            g2p_dict = pickle.load(pickle_file)
+    else:
+        g2p_dict = read_dict()
+        cache_dict(g2p_dict, CACHE_PATH)
+    return g2p_dict
+# Pronunciation dictionary, loaded once at import time; get_dict reads the
+# pickle cache or creates it from the dictionary file.
+eng_dict = get_dict()
+def refine_ph(phn):
+    tone = 0
+    if re.search(r"\d$", phn):
+        tone = int(phn[-1]) + 1
+        phn = phn[:-1]
+    else:
+        tone = 3
+    return phn.lower(), tone
+def refine_syllables(syllables):
+    tones = []
+    phonemes = []
+    for phn_list in syllables:
+        for i in range(len(phn_list)):
+            phn = phn_list[i]
+            phn, tone = refine_ph(phn)
+            phonemes.append(phn)
+            tones.append(tone)
+    return phonemes, tones
+import inflect
+# inflect engine used by _expand_ordinal and _expand_number to spell numbers.
+_inflect = inflect.engine()
+# Patterns consumed by normalize_numbers (applied in the order used there).
+_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
+_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
+_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
+_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
+_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
+_number_re = re.compile(r"[0-9]+")
+# List of (regular expression, replacement) pairs for abbreviations:
+# NOTE(review): not referenced by any function shown in this module.
+_abbreviations = [
+    (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+    for x in [
+        ("mrs", "misess"),
+        ("mr", "mister"),
+        ("dr", "doctor"),
+        ("st", "saint"),
+        ("co", "company"),
+        ("jr", "junior"),
+        ("maj", "major"),
+        ("gen", "general"),
+        ("drs", "doctors"),
+        ("rev", "reverend"),
+        ("lt", "lieutenant"),
+        ("hon", "honorable"),
+        ("sgt", "sergeant"),
+        ("capt", "captain"),
+        ("esq", "esquire"),
+        ("ltd", "limited"),
+        ("col", "colonel"),
+        ("ft", "fort"),
+    ]
+]
+# List of (ipa, lazy ipa) pairs:
+# NOTE(review): the three IPA tables below are not referenced by any function
+# shown in this module.
+_lazy_ipa = [
+    (re.compile("%s" % x[0]), x[1])
+    for x in [
+        ("r", "ɹ"),
+        ("æ", "e"),
+        ("ɑ", "a"),
+        ("ɔ", "o"),
+        ("ð", "z"),
+        ("θ", "s"),
+        ("ɛ", "e"),
+        ("ɪ", "i"),
+        ("ʊ", "u"),
+        ("ʒ", "ʥ"),
+        ("ʤ", "ʥ"),
+        ("ˈ", "↓"),
+    ]
+]
+# List of (ipa, lazy ipa2) pairs:
+_lazy_ipa2 = [
+    (re.compile("%s" % x[0]), x[1])
+    for x in [
+        ("r", "ɹ"),
+        ("ð", "z"),
+        ("θ", "s"),
+        ("ʒ", "ʑ"),
+        ("ʤ", "dʑ"),
+        ("ˈ", "↓"),
+    ]
+]
+# List of (ipa, ipa2) pairs
+_ipa_to_ipa2 = [
+    (re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
+]
+def _expand_dollars(m):
+    match = m.group(1)
+    parts = match.split(".")
+    if len(parts) > 2:
+        return match + " dollars"  # Unexpected format
+    dollars = int(parts[0]) if parts[0] else 0
+    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+    if dollars and cents:
+        dollar_unit = "dollar" if dollars == 1 else "dollars"
+        cent_unit = "cent" if cents == 1 else "cents"
+        return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
+    elif dollars:
+        dollar_unit = "dollar" if dollars == 1 else "dollars"
+        return "%s %s" % (dollars, dollar_unit)
+    elif cents:
+        cent_unit = "cent" if cents == 1 else "cents"
+        return "%s %s" % (cents, cent_unit)
+    else:
+        return "zero dollars"
+def _remove_commas(m):
+    return m.group(1).replace(",", "")
+def _expand_ordinal(m):
+    """Pass the whole matched text of ``m`` to ``inflect`` for spelling out.
+
+    Used by normalize_numbers as the replacement callback for ``_ordinal_re``
+    (digits followed by st/nd/rd/th).
+    """
+    return _inflect.number_to_words(m.group(0))
+def _expand_number(m):
+    num = int(m.group(0))
+    if num > 1000 and num < 3000:
+        if num == 2000:
+            return "two thousand"
+        elif num > 2000 and num < 2010:
+            return "two thousand " + _inflect.number_to_words(num % 100)
+        elif num % 100 == 0:
+            return _inflect.number_to_words(num // 100) + " hundred"
+        else:
+            return _inflect.number_to_words(
+                num, andword="", zero="oh", group=2
+            ).replace(", ", " ")
+    else:
+        return _inflect.number_to_words(num, andword="")
+def _expand_decimal_point(m):
+    return m.group(1).replace(".", " point ")
+def normalize_numbers(text):
+    text = re.sub(_comma_number_re, _remove_commas, text)
+    text = re.sub(_pounds_re, r"\1 pounds", text)
+    text = re.sub(_dollars_re, _expand_dollars, text)
+    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+    text = re.sub(_ordinal_re, _expand_ordinal, text)
+    text = re.sub(_number_re, _expand_number, text)
+    return text
+def text_normalize(text):
+    text = normalize_numbers(text)
+    text = replace_punctuation(text)
+    text = re.sub(r"([,;.\?\!])([\w])", r"\1 \2", text)
+    return text
+def distribute_phone(n_phone, n_word):
+    phones_per_word = [0] * n_word
+    for task in range(n_phone):
+        min_tasks = min(phones_per_word)
+        min_index = phones_per_word.index(min_tasks)
+        phones_per_word[min_index] += 1
+    return phones_per_word
+def sep_text(text):
+    words = re.split(r"([,;.\?\!\s+])", text)
+    words = [word for word in words if word.strip() != ""]
+    return words
+def text_to_words(text):
+    tokens = tokenizer.tokenize(text)
+    words = []
+    for idx, t in enumerate(tokens):
+        if t.startswith("▁"):
+            words.append([t[1:]])
+        else:
+            if t in punctuation:
+                if idx == len(tokens) - 1:
+                    words.append([f"{t}"])
+                else:
+                    if (
+                        not tokens[idx + 1].startswith("▁")
+                        and tokens[idx + 1] not in punctuation
+                    ):
+                        if idx == 0:
+                            words.append([])
+                        words[-1].append(f"{t}")
+                    else:
+                        words.append([f"{t}"])
+            else:
+                if idx == 0:
+                    words.append([])
+                words[-1].append(f"{t}")
+    return words
+def g2p(text):
+    phones = []
+    tones = []
+    phone_len = []
+    # words = sep_text(text)
+    # tokens = [tokenizer.tokenize(i) for i in words]
+    words = text_to_words(text)
+    for word in words:
+        temp_phones, temp_tones = [], []
+        if len(word) > 1:
+            if "'" in word:
+                word = ["".join(word)]
+        for w in word:
+            if w in punctuation:
+                temp_phones.append(w)
+                temp_tones.append(0)
+                continue
+            if w.upper() in eng_dict:
+                phns, tns = refine_syllables(eng_dict[w.upper()])
+                temp_phones += [post_replace_ph(i) for i in phns]
+                temp_tones += tns
+                # w2ph.append(len(phns))
+            else:
+                phone_list = list(filter(lambda p: p != " ", _g2p(w)))
+                phns = []
+                tns = []
+                for ph in phone_list:
+                    if ph in arpa:
+                        ph, tn = refine_ph(ph)
+                        phns.append(ph)
+                        tns.append(tn)
+                    else:
+                        phns.append(ph)
+                        tns.append(0)
+                temp_phones += [post_replace_ph(i) for i in phns]
+                temp_tones += tns
+        phones += temp_phones
+        tones += temp_tones
+        phone_len.append(len(temp_phones))
+        # phones = [post_replace_ph(i) for i in phones]
+    word2ph = []
+    for token, pl in zip(words, phone_len):
+        word_len = len(token)
+        aaa = distribute_phone(pl, word_len)
+        word2ph += aaa
+    phones = ["_"] + phones + ["_"]
+    tones = [0] + tones + [0]
+    word2ph = [1] + word2ph + [1]
+    assert len(phones) == len(tones), text
+    assert len(phones) == sum(word2ph), text
+    return phones, tones, word2ph
+def get_bert_feature(text, word2ph):
+    """Delegate to ``text.english_bert_mock.get_bert_feature``.
+
+    The import happens inside the function, so ``english_bert_mock`` is only
+    imported when this function is first called.
+    """
+    from text import english_bert_mock
+    return english_bert_mock.get_bert_feature(text, word2ph)
+if __name__ == "__main__":
+    # Manual check: print the (phones, tones, word2ph) result for one sentence.
+    # print(get_dict())
+    # print(eng_word_to_phoneme("hello"))
+    print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
+    # Disabled snippet that collects every phone label found in eng_dict:
+    # all_phones = set()
+    # for k, syllables in eng_dict.items():
+    #     for group in syllables:
+    #         for ph in group:
+    #             all_phones.add(ph)
+    # print(all_phones)

text/english_bert_mock.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import sys
+import torch
+from transformers import DebertaV2Model, DebertaV2Tokenizer
+# Relative path the DeBERTa tokenizer and model weights are loaded from.
+LOCAL_PATH = "./bert/deberta-v3-large"
+# Tokenizer is created once, at import time.
+tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
+# Loaded models keyed by device string; filled lazily by get_bert_feature.
+models = dict()
+def get_bert_feature(
+    text,
+    word2ph,
+    device="cpu",
+    style_text=None,
+    style_weight=0.7,
+):
+    if (
+        sys.platform == "darwin"
+        and torch.backends.mps.is_available()
+        and device == "cpu"
+    ):
+        device = "mps"
+    if not device:
+        device = "cuda"
+    if device not in models.keys():
+        models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
+    with torch.no_grad():
+        inputs = tokenizer(text, return_tensors="pt")
+        for i in inputs:
+            inputs[i] = inputs[i].to(device)
+        res = models[device](**inputs, output_hidden_states=True)
+        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
+        if style_text:
+            style_inputs = tokenizer(style_text, return_tensors="pt")
+            for i in style_inputs:
+                style_inputs[i] = style_inputs[i].to(device)
+            style_res = models[device](**style_inputs, output_hidden_states=True)
+            style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
+            style_res_mean = style_res.mean(0)
+    assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
+    word2phone = word2ph
+    phone_level_feature = []
+    for i in range(len(word2phone)):
+        if style_text:
+            repeat_feature = (
+                res[i].repeat(word2phone[i], 1) * (1 - style_weight)
+                + style_res_mean.repeat(word2phone[i], 1) * style_weight
+            )
+        else:
+            repeat_feature = res[i].repeat(word2phone[i], 1)
+        phone_level_feature.append(repeat_feature)
+    phone_level_feature = torch.cat(phone_level_feature, dim=0)
+    return phone_level_feature.T

text/jyutping.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

text/symbols.py ADDED Viewed

	@@ -0,0 +1,152 @@

+# Punctuation marks that are kept as symbols of their own.
+punctuation = ["!", "?", "…", ",", ".", "'", "-"]
+# Punctuation plus the "SP" and "UNK" markers; appended last in `symbols`.
+pu_symbols = punctuation + ["SP", "UNK"]
+# Padding symbol; placed first in `symbols`.
+pad = "_"
+# English
+# Lower-case phone labels without stress digits, except for upper-case "V".
+en_symbols = [
+    "aa",
+    "ae",
+    "ah",
+    "ao",
+    "aw",
+    "ay",
+    "b",
+    "ch",
+    "d",
+    "dh",
+    "eh",
+    "er",
+    "ey",
+    "f",
+    "g",
+    "hh",
+    "ih",
+    "iy",
+    "jh",
+    "k",
+    "l",
+    "m",
+    "n",
+    "ng",
+    "ow",
+    "oy",
+    "p",
+    "r",
+    "s",
+    "sh",
+    "t",
+    "th",
+    "uh",
+    "uw",
+    "V",
+    "w",
+    "y",
+    "z",
+    "zh",
+]
+# Number of tone ids reserved for English.
+num_en_tones = 4
+# Cantonese
+# NOTE: "m" and "ng" each appear twice (presumably once as an initial and
+# once as a syllabic final; confirm) and the list starts with an empty
+# string. The duplicates disappear when `normal_symbols` is built with
+# set(); the entries are left untouched here because they determine the
+# symbol ids.
+yue_symbols = [
+    "",
+    "aa",
+    "aai",
+    "aak",
+    "aam",
+    "aan",
+    "aang",
+    "aap",
+    "aat",
+    "aau",
+    "ai",
+    "ak",
+    "am",
+    "an",
+    "ang",
+    "ap",
+    "at",
+    "au",
+    "b",
+    "c",
+    "d",
+    "e",
+    "ei",
+    "ek",
+    "em",
+    "eng",
+    "eoi",
+    "eon",
+    "eot",
+    "ep",
+    "eu",
+    "f",
+    "g",
+    "gw",
+    "h",
+    "i",
+    "ik",
+    "im",
+    "in",
+    "ing",
+    "ip",
+    "it",
+    "iu",
+    "j",
+    "k",
+    "kw",
+    "l",
+    "m",
+    "m",
+    "n",
+    "ng",
+    "ng",
+    "o",
+    "oe",
+    "oek",
+    "oeng",
+    "oi",
+    "ok",
+    "on",
+    "ong",
+    "ot",
+    "ou",
+    "p",
+    "s",
+    "t",
+    "u",
+    "ui",
+    "uk",
+    "un",
+    "ung",
+    "ut",
+    "w",
+    "yu",
+    "yun",
+    "yut",
+    "z"
+]
+# Number of tone ids reserved for Cantonese.
+num_yue_tones = 7
+# combine all symbols
+# Union of both languages, de-duplicated and sorted so the order is stable.
+normal_symbols = sorted(
+    set(en_symbols + yue_symbols))
+# Final symbol table: pad first, then the phone symbols, then punctuation,
+# "SP" and "UNK".
+symbols = [pad] + normal_symbols + pu_symbols
+# Indices in `symbols` of the punctuation / "SP" / "UNK" entries.
+sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
+# combine all tones
+num_tones = num_en_tones + num_yue_tones
+# language maps
+language_id_map = {"EN": 0, "YUE": 1}
+num_languages = len(language_id_map.keys())
+# Offset added to a language's tone ids: English tones start at 0 and
+# Cantonese tones start after the English ones.
+language_tone_start_map = {
+    "EN": 0,
+    "YUE": num_en_tones,
+}
+if __name__ == "__main__":
+    # Print the symbols shared by the Cantonese and English sets.
+    a = set(yue_symbols)
+    b = set(en_symbols)
+    print(sorted(a & b))

text/yue_dict.txt ADDED Viewed

The diff for this file is too large to render. See raw diff