import re
import unicodedata
import cn2an
import pinyin_jyutping
import pycantonese

from text.symbols import punctuation
# from symbols import punctuation

from functools import reduce


def normalizer(x):
    """Return ``x`` after cn2an's sentence transform in ``"an2cn"`` mode.

    In that mode cn2an rewrites Arabic numerals found in the text as Chinese
    numerals; the rest of the string is passed through.
    """
    converted = cn2an.transform(x, "an2cn")
    return converted


# Module-wide PinyinJyutping converter; get_jyutping() calls its .jyutping()
# method. Note that jyutping2waitau() takes a parameter that is also named
# `j`, which shadows this instance inside that function.
j = pinyin_jyutping.PinyinJyutping()

# Prefixes used by rom_to_initials_finals_tones() to split a toneless syllable
# into [initial, final]. That function walks this list from top to bottom and
# stops at the FIRST entry the syllable starts with, so the order is part of
# the behaviour: a longer entry has to come before any shorter entry that is
# its own prefix ("ts" before "t", "dz" before "d", "kw" before "k",
# "gw" before "g", "ng" before "n", "ong"/"on"/"ou"/"oi"/"ok" before "o").
# Besides consonant onsets the list also holds vowel-initial entries
# ("äi", "a", "e", "ong", "uk", ...) so that syllables without a consonant
# onset still find a match.
INITIALS = [
    "äi",
    "äm",
    "äng",
    "äu",
    "äp",
    "ät",
    "äk",
    "æ",
    "a",
    "p",
    "b",
    "e",
    "ts",
    "t",
    "dz",
    "d",
    "kw",
    "k",
    "gw",
    "g",
    "f",
    "h",
    "l",
    "m",
    "ng",
    "n",
    "s",
    "w",
    "c",
    "z",
    "y",
    "ong",
    "on",
    "ou",
    "oi",
    "ok",
    "o",
    "uk",
    "ung",
]


# Punctuation folding table consumed by replace_punctuation(): every key found
# in the text is rewritten to its value (full-width and CJK marks become their
# ASCII counterparts, every kind of quote or bracket becomes "'", dashes and
# tildes become "-", a newline or "$" becomes "."). The keys are joined into
# one regex alternation in the order listed here.
# NOTE: "—" is also a key of replacement_chars; inside text_normalize() the
# mapping from this table is the one that takes effect for that character.
rep_map = {
    "￼": ",",
    "：": ",",
    "︰": ",",
    "；": ",",
    "，": ",",
    "。": ".",
    "！": "!",
    "？": "?",
    "﹖": "?",
    "﹗": "!",
    "\n": ".",
    "·": ",",
    "、": ",",
    "丶": ",",
    "...": "…",
    "⋯": "…",
    "$": ".",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "（": "'",
    "）": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "～": "-",
    "~": "-",
    "「": "'",
    "」": "'",
    "_": "-",
}

# Look-alike and variant characters, each folded to the character on the right
# by replace_chars().
#
# Nine of the keys are CJK Compatibility Ideographs (U+F9xx / U+FA08). They
# render exactly like the unified ideograph they map to, and NFC normalisation
# turns them into that ideograph, so an editor or tool that normalises this
# file would silently turn such an entry into a no-op "X": "X". They are
# therefore written as explicit escapes, with the glyph shown in the comment.
replacement_chars = {
    "ㄧ": "一",  # Bopomofo letter typed in place of the numeral
    "—": "一",  # dash typed in place of the numeral
    "\uf901": "\u66f4",  # 更
    "\uf967": "\u4e0d",  # 不
    "\uf9be": "\u6599",  # 料
    "\uf997": "\u806f",  # 聯
    "\ufa08": "\u884c",  # 行
    "\uf9dd": "\u5229",  # 利
    "謢": "護",
    "岀": "出",
    "鎭": "鎮",
    "戯": "戲",
    "旣": "既",
    "\uf9f7": "\u7acb",  # 立
    "\uf92d": "\u4f86",  # 來
    "\uf98e": "\u5e74",  # 年
    "㗇": "蝦",
}


def replace_punctuation(text):
    """Fold punctuation through ``rep_map`` and strip unsupported characters.

    First every occurrence of a ``rep_map`` key is replaced by its value.
    Afterwards only two kinds of characters are kept: those whose Unicode
    name starts with ``CJK UNIFIED IDEOGRAPH`` and those contained in
    ``punctuation``. Everything else (Latin letters, digits, spaces, ...)
    is removed.
    """
    # Disabled experiment kept from the original:
    # text = text.replace("嗯", "恩").replace("呣", "母")
    alternation = "|".join(map(re.escape, rep_map))
    folded = re.sub(alternation, lambda match: rep_map[match.group()], text)

    def is_kept(char):
        # Characters without a Unicode name fall back to "" and so can only
        # survive through the punctuation test.
        if unicodedata.name(char, "").startswith("CJK UNIFIED IDEOGRAPH"):
            return True
        return char in punctuation

    return "".join(filter(is_kept, folded))


def replace_chars(text):
    """Apply every ``replacement_chars`` substitution to ``text``.

    The substitutions run one after another in the table's own order, each
    on the result of the previous one.
    """
    return reduce(
        lambda current, pair: current.replace(pair[0], pair[1]),
        replacement_chars.items(),
        text,
    )


def text_normalize(text):
    """Normalise raw text before grapheme-to-phoneme conversion.

    Steps, in order:

    1. ``normalizer`` rewrites numerals through cn2an.
    2. Look-alike characters listed in ``replacement_chars`` are folded to
       their target character.
    3. ``replace_punctuation`` folds punctuation through ``rep_map`` and then
       removes every character that is neither a CJK unified ideograph nor
       in ``punctuation``.

    Step 2 has to come before step 3. Several ``replacement_chars`` keys
    (the Bopomofo ``ㄧ``, CJK compatibility ideographs) are not CJK unified
    ideographs, so the filter in step 3 would delete them before they could
    be mapped, and the character would vanish from the output.

    A character that is a key of both tables (``—``) keeps the meaning
    ``rep_map`` gives it, exactly as when the punctuation pass ran first.

    Args:
        text: Raw input string.

    Returns:
        The normalised string, containing only CJK unified ideographs and
        characters from ``punctuation``.
    """
    text = normalizer(text)
    for source, target in replacement_chars.items():
        # rep_map wins on conflicts, so those keys are left for step 3.
        if source not in rep_map:
            text = text.replace(source, target)
    # No replacement_chars key can be left after this point: the ones handled
    # above are gone and the remaining ones are consumed by rep_map.
    return replace_punctuation(text)


def rom_to_initials_finals_tones(jyuping_syllables):
    """Split romanised syllables into initial/final phones with tones.

    Each item of ``jyuping_syllables`` is handled as follows:

    * an item found in ``punctuation`` becomes one phone with tone 0;
    * any other item is treated as a syllable. A trailing digit is taken as
      its tone (tone 0 when the last character is not a digit). The rest is
      split at the first ``INITIALS`` entry it starts with, giving two
      phones that both carry the syllable's tone. When nothing is left after
      the matched entry, the final repeats the entry's last character
      (``"m"`` -> ``["m", "m"]``).

    A syllable that starts with no ``INITIALS`` entry produces no phones. It
    is still skipped, as before, but a ``UserWarning`` is now emitted so the
    resulting gap in ``word2ph`` does not go unnoticed.

    Args:
        jyuping_syllables: Iterable of syllable / punctuation strings. Note
            that a plain ``str`` is iterated character by character.

    Returns:
        ``(initials_finals, tones, word2ph)``: the flat phone list, one tone
        per phone, and the number of phones produced by each consumed item
        (1 for punctuation, 2 for a syllable).

    Raises:
        AssertionError: If a tone digit is outside 0-6.
    """
    import warnings

    initials_finals = []
    tones = []
    word2ph = []

    for syllable in jyuping_syllables:
        if syllable in punctuation:
            initials_finals.append(syllable)
            tones.append(0)
            word2ph.append(1)  # punctuation is a single phone
            continue

        try:
            tone = int(syllable[-1])
            syllable_without_tone = syllable[:-1]
        except ValueError:
            # No trailing tone digit: keep the whole string, tone 0.
            tone = 0
            syllable_without_tone = syllable

        assert str(tone) in "1234560"

        for initial in INITIALS:
            if not syllable_without_tone.startswith(initial):
                continue
            if syllable_without_tone.startswith("nga"):
                # Always split "nga..." after "ng", whichever entry matched;
                # the remainder starts with "a" and is never empty.
                onset = syllable_without_tone[:2]
                rime = syllable_without_tone[2:]
            else:
                onset = initial
                rime = syllable_without_tone[len(initial) :] or initial[-1]
            initials_finals.extend([onset, rime])
            tones.extend([tone, tone])
            word2ph.append(2)
            break
        else:
            warnings.warn(
                f"no entry in INITIALS is a prefix of {syllable!r}; "
                "the syllable was dropped",
                stacklevel=2,
            )

    assert len(initials_finals) == len(tones)
    assert sum(word2ph) == len(initials_finals)
    return initials_finals, tones, word2ph


def get_jyutping(text):
    """Convert Chinese text into a list of Jyutping syllables and punctuation.

    The text is first romanised with the module-level PinyinJyutping
    instance ``j`` (tone numbers, space separated). A word of that output
    that still consists only of non-punctuation characters of the input is
    considered unconverted and is retried with
    ``pycantonese.characters_to_jyutping``, taking the reading of the first
    segment it returns.

    Args:
        text: Input string (typically the output of ``text_normalize``).

    Returns:
        A list of strings, each a Jyutping syllable or a punctuation mark
        from ``punctuation``.

    Raises:
        ValueError: If a word is neither punctuation nor of the form
            ``letters + tone digit 1-6`` after conversion, or if the
            pycantonese fallback has no reading for it.
    """
    converted_text = j.jyutping(text, tone_numbers=True, spaces=True)
    converted_words = converted_text.split()

    # # replace ... with …
    # converted_text = re.sub(r"\.{2,}", "…", converted_text)
    # # replace -- with -
    # converted_text = re.sub(r"-{2,}", "-", converted_text)

    # Both are loop invariants, so build them once.
    source_chars = set(text) - set(punctuation)
    syllable_pattern = re.compile(r"^[a-zA-Z]+[1-6]$")

    for i, word in enumerate(converted_words):
        if set(word) <= source_chars:
            fallback = pycantonese.characters_to_jyutping(word)[0][1]
            if fallback is None:
                # pycantonese reports an unknown reading as None; fail with
                # the same error as any other unconvertible word instead of
                # a TypeError from the checks below.
                raise ValueError(f"Failed to convert {word}, {converted_text}")
            converted_words[i] = fallback

        if (
            converted_words[i] not in punctuation
            and syllable_pattern.search(converted_words[i]) is None
        ):
            raise ValueError(
                f"Failed to convert {converted_words[i]}, {converted_text}"
            )

    jyutping_sentence = " ".join(converted_words)

    # Pad every punctuation mark with spaces so that split() isolates it.
    for symbol in punctuation:
        jyutping_sentence = jyutping_sentence.replace(symbol, " " + symbol + " ")
    jyutping_array = jyutping_sentence.split()

    return jyutping_array


def jyutping2waitau(j):
    """Rewrite one Jyutping token in the romanisation used by ``INITIALS``.

    The literal substitutions below are applied one after another, each on
    the result of the previous one (so ``"aa"`` first becomes ``"ää"`` and
    then ``"a"``). Finally a ``w`` is inserted after ``g``/``k`` when it is
    followed by ``u``, unless that ``u`` is followed by ``ng`` or ``k``.

    Note that the parameter shadows the module-level converter ``j``.
    """
    substitutions = (
        ("a", "ä"),
        ("ää", "a"),
        ("ae", "æ"),
        ("oe", "ö"),
        ("eo", "ö"),
        ("yu", "ü"),
        ("j", "y"),
    )

    rewritten = j
    for old, new in substitutions:
        rewritten = rewritten.replace(old, new)

    return re.sub("(g|k)u(?!ng|k)", "\\1wu", rewritten)


def get_bert_feature(text, word2ph):
    """Delegate to ``text.cantonese_bert.get_bert_feature``.

    The import happens inside the function, so ``text.cantonese_bert`` is
    only loaded when features are actually requested.
    """
    from text import cantonese_bert as bert_backend

    features = bert_backend.get_bert_feature(text, word2ph)
    return features


def g2p(text, g2p_bypass=False):
    """Turn romanised syllables into padded phone, tone and word2ph lists.

    ``text`` is handed unchanged to ``rom_to_initials_finals_tones``, so it
    has to be an iterable of romanised syllables and punctuation marks.
    ``g2p_bypass`` is accepted but not consulted: the path that converts
    Chinese text first is disabled (see the commented block below).

    Returns:
        ``(phones, tones, word2ph)``, each wrapped in one padding entry on
        both sides: ``"_"`` for phones, ``0`` for tones, ``1`` for word2ph.
    """
    # Disabled Chinese-text path, kept for reference:
    # if not g2p_bypass:
    #     jyuping = get_jyutping(text)
    #     rom = [jyutping2waitau(j) for j in jyuping]
    #     phones, tones, word2ph = rom_to_initials_finals_tones(rom)
    # else:
    # text.replace("aik", "æk")
    core_phones, core_tones, core_word2ph = rom_to_initials_finals_tones(text)
    return (
        ["_"] + core_phones + ["_"],
        [0] + core_tones + [0],
        [1] + core_word2ph + [1],
    )


def test_dataset(dataset, metadata):
    """Run the phone conversion over a metadata file and report problems.

    Two layouts are supported:

    * ``dataset == "ciugo"``: a ``|``-separated file with three columns whose
      last column is a space-separated romanisation. Each row is passed to
      ``rom_to_initials_finals_tones`` and must yield exactly one ``word2ph``
      entry per input syllable.
    * anything else: one entry per line, the text being the last ``|``
      field. The text goes through ``text_normalize`` and ``g2p`` and must
      yield ``len(text) + 2`` ``word2ph`` entries (two padding entries).

    Nothing is returned. Rows that fail to convert and rows whose
    ``word2ph`` length does not fit are printed, and processing continues
    with the next row.

    Args:
        dataset: Layout selector, ``"ciugo"`` or any other value.
        metadata: Path of the metadata file (read as UTF-8).
    """
    import csv
    import tqdm

    def report_mismatch(label, phones, tones, word2ph):
        print(f"word2ph not fit!: {label}")
        print(f"phones: {phones}")
        print(f"tones: {tones}")
        print(f"word2ph: {word2ph}")

    with open(metadata, "r", encoding="utf-8") as _file_:
        if dataset == "ciugo":
            # Materialised so that tqdm knows the total number of rows.
            reader = list(csv.reader(_file_, delimiter="|"))
            for row in tqdm.tqdm(reader, desc="Processing dataset"):
                try:
                    # Unpacking is inside the try so a malformed row (for
                    # example a blank line) is reported, not fatal.
                    _, _, rom_text = row
                    rom_syllables = rom_text.split()
                    phones, tones, word2ph = rom_to_initials_finals_tones(
                        rom_syllables
                    )
                except Exception as e:
                    print(f"Error converting line: {row}")
                    print(f"Exception: {e}")
                    continue
                # The previous check compared against an undefined name
                # `text`; the reference length is the number of syllables
                # that were fed in.
                if len(word2ph) != len(rom_syllables):
                    report_mismatch(rom_text, phones, tones, word2ph)
        else:
            # The handle opened above is reused; the file is not reopened.
            for line in _file_:
                text = line.strip().split("|")[-1]
                text = text_normalize(text)
                try:
                    phones, tones, word2ph = g2p(text)
                except Exception as e:
                    print(f"Error converting text: {text}")
                    print(f"Exception: {e}")
                    continue
                if len(word2ph) != len(text) + 2:
                    report_mismatch(text, phones, tones, word2ph)


if __name__ == "__main__":
    # Script entry point: with --dataset, validate a metadata file; without
    # it, run a single hard-coded sample through the pipeline and print it.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--dataset", type=str, choices=["ciugo", "list"])
    cli.add_argument("--metadata", type=str)
    args = cli.parse_args()

    if not args.dataset:
        g2p_bypass = False
        # from text.cantonese_bert import get_bert_feature

        # Sample inputs; only the last one is used.
        # Further sample: "你點解會咁柒㗎？我真係唔該晒你呀！"
        samples = (
            "佢哋最叻咪就係去㗇人傷害人,得個殼咋!",
            "不妨聽聽西廂記裏面鶯鶯嘅唱詞.",  # g2p_bypass = False
            "ni1 seng4 yäk6 co1 go2 täu4",  # g2p_bypass = True
            "咗",
        )
        text = samples[-1]

        if g2p_bypass:
            text = text.split()  # text: list
        else:
            text = text_normalize(text)
        print(text)

        phones, tones, word2ph = g2p(text, g2p_bypass)
        # bert = get_bert_feature(text, word2ph)

        # print(phones, tones, word2ph, bert.shape)
        print(phones, tones, word2ph)
    else:
        # Fall back to the default path only when --metadata was not given.
        metadata_path = "./metadata.csv" if args.metadata is None else args.metadata
        test_dataset(args.dataset, metadata_path)