File size: 3,668 Bytes
960cd20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import regex as re

try:
    from contants import config
except:
    pass

langid_languages = ["af", "am", "an", "ar", "as", "az", "be", "bg", "bn", "br", "bs", "ca", "cs", "cy", "da", "de",
                    "dz", "el",
                    "en", "eo", "es", "et", "eu", "fa", "fi", "fo", "fr", "ga", "gl", "gu", "he", "hi", "hr", "ht",
                    "hu", "hy",
                    "id", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "ku", "ky", "la", "lb", "lo", "lt",
                    "lv", "mg",
                    "mk", "ml", "mn", "mr", "ms", "mt", "nb", "ne", "nl", "nn", "no", "oc", "or", "pa", "pl", "ps",
                    "pt", "qu",
                    "ro", "ru", "rw", "se", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th", "tl", "tr",
                    "ug", "uk",
                    "ur", "vi", "vo", "wa", "xh", "zh", "zu"]


def classify_language(text: str, target_languages: list = None) -> str:
    try:
        module = config.language_identification.language_identification_library.lower()
    except:
        module = "langid"

    if not target_languages:
        target_languages = None

    if module == "fastlid" or module == "fasttext":
        from fastlid import fastlid, supported_langs
        classifier = fastlid
        if target_languages is not None:
            target_languages = [lang for lang in target_languages if lang in supported_langs]
            fastlid.set_languages = target_languages
    elif module == "langid":
        import langid
        classifier = langid.classify
        if target_languages is not None:
            target_languages = [lang for lang in target_languages if lang in langid_languages]
            langid.set_languages(target_languages)
    else:
        raise ValueError(f"Wrong LANGUAGE_IDENTIFICATION_LIBRARY in config.py")

    lang = classifier(text)[0]

    return lang


# def classify_zh_ja(text: str) -> str:
#     for idx, char in enumerate(text):
#         unicode_val = ord(char)
#
#         # 检测日语字符
#         if 0x3040 <= unicode_val <= 0x309F or 0x30A0 <= unicode_val <= 0x30FF:
#             return "ja"
#
#         # 检测汉字字符
#         if 0x4E00 <= unicode_val <= 0x9FFF:
#             # 检查周围的字符
#             next_char = text[idx + 1] if idx + 1 < len(text) else None
#
#             if next_char and (0x3040 <= ord(next_char) <= 0x309F or 0x30A0 <= ord(next_char) <= 0x30FF):
#                 return "ja"
#
#     return "zh"


def split_alpha_nonalpha(text, mode=1):
    """
    Splits the input text based on the specified mode.

    Parameters:
    - text (str): The input text to be split.
    - mode (int): The mode for splitting (1 or 2).
        - Mode 1: Splits based on the pattern - Chinese/Japanese followed by English or vice versa.
        - Mode 2: Splits based on the pattern - Chinese/Japanese followed by English/digit or vice versa.

    Returns:
    - list: A list of substrings after the split.
    """
    if mode == 1:
        pattern = r'(?<=[\u4e00-\u9fff\u3040-\u30FF\d\s])(?=[\p{Latin}])|(?<=[\p{Latin}\s])(?=[\u4e00-\u9fff\u3040-\u30FF\d])'
    elif mode == 2:
        pattern = r'(?<=[\u4e00-\u9fff\u3040-\u30FF\s])(?=[\p{Latin}\d])|(?<=[\p{Latin}\d\s])(?=[\u4e00-\u9fff\u3040-\u30FF])'
    else:
        raise ValueError("Invalid mode. Supported modes are 1 and 2.")

    return re.split(pattern, text)


if __name__ == "__main__":
    text = "这是一个测试文本"
    print(classify_language(text))
    # print(classify_zh_ja(text))  # "zh"

    text = "これはテストテキストです"
    print(classify_language(text))
    # print(classify_zh_ja(text))  # "ja"