Spaces:

yhavinga
/

dutch-tokenizer-arena

Running

App Files Files Community

xu-song committed on May 1, 2024

Commit

9558ae0

1 Parent(s): 1b7fc74

update

Browse files

Files changed (1) hide show

utils/lang_util_2.py +139 -0

utils/lang_util_2.py ADDED Viewed

	@@ -0,0 +1,139 @@

+"""
+日语、韩语 等
+https://www.cnblogs.com/luoganttcc/p/16605150.html
+https://zhuanlan.zhihu.com/p/618684374
+- https://zhuanlan.zhihu.com/p/84625185 赞
+## 相关包
+import opencc
+import langid
+import langdetect
+https://github.com/pemistahl/lingua-py
+  - 原理：
+"""
+from zhon.hanzi import punctuation as zh_punc
def is_zh_char(uchar):
    """Return True if *uchar* sorts between U+4E00 and U+9FA5 inclusive.

    The comparison is a plain string comparison, so it is meant to be called
    with a single character.

    Reference: jieba builds its Han regex from the range U+4E00..U+9FD5
    (https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48);
    the upper bound used here, U+9FA5, is narrower than that.
    """
    lower_bound = '\u4e00'
    upper_bound = '\u9fa5'
    below = uchar < lower_bound
    above = uchar > upper_bound
    return not (below or above)
def has_zh_punc(text):
    """Return True if any character of *text* occurs in ``zh_punc``.

    ``zh_punc`` is the punctuation collection imported from ``zhon.hanzi``
    at the top of this module. Empty input yields False.
    """
    for ch in text:
        if ch in zh_punc:
            return True
    return False
def has_zh(text):
    """Return True if at least one character of *text* passes ``is_zh_char``.

    Empty input yields False.
    """
    for ch in text:
        if is_zh_char(ch):
            return True
    return False
def get_zh_count(text):
    """Return how many characters of *text* pass ``is_zh_char``."""
    total = 0
    for ch in text:
        if is_zh_char(ch):
            total += 1
    return total
def is_all_zh(text):
    """Return True if every character of *text* passes ``is_zh_char``.

    An empty *text* returns True, because there is no character that fails.
    """
    for ch in text:
        if not is_zh_char(ch):
            return False
    return True
def is_all_en(text):
    """Return True if *text* is non-empty and made only of ASCII letters.

    The text is encoded to UTF-8 and checked with ``bytes.isalpha``, which
    only accepts the ASCII letters a-z / A-Z. Digits, spaces, punctuation
    and any non-ASCII character therefore make the result False.
    """
    encoded = text.encode('utf-8')
    return encoded.isalpha()
+# import opencc
def is_russian():
    """Russian detection placeholder: takes no arguments and returns None."""
    return None
def is_french():
    """French detection placeholder: takes no arguments and returns None."""
    return None
def aa():
    """Placeholder for zh-Hans (Simplified Chinese) handling.

    Does nothing and returns None.

    NOTE(review): a second function named ``aa`` is defined further down in
    this module and replaces this one at import time.
    """
    return None
def bb():
    """Placeholder for zh-Hant (Traditional Chinese) handling.

    Does nothing and returns None.
    """
    return None
# Inclusive code-point intervals treated as CJK by this module.
# Each entry maps "from" / "to" to the integer bounds of one Unicode block.
ranges = [
    {"from": 0x3300, "to": 0x33FF},    # CJK Compatibility
    {"from": 0xFE30, "to": 0xFE4F},    # CJK Compatibility Forms
    {"from": 0xF900, "to": 0xFAFF},    # CJK Compatibility Ideographs
    {"from": 0x2F800, "to": 0x2FA1F},  # CJK Compatibility Ideographs Supplement
    {"from": 0x3040, "to": 0x309F},    # Japanese Hiragana (96 code points)
    {"from": 0x30A0, "to": 0x30FF},    # Japanese Katakana (96 code points)
    {"from": 0x2E80, "to": 0x2EFF},    # CJK Radicals Supplement
    {"from": 0x4E00, "to": 0x9FFF},    # CJK Unified Ideographs (Chinese; superset of U+4E00..U+9FA5)
    {"from": 0x3400, "to": 0x4DBF},    # CJK Unified Ideographs Extension A
    {"from": 0x20000, "to": 0x2A6DF},  # CJK Unified Ideographs Extension B
    {"from": 0x2A700, "to": 0x2B73F},  # CJK Unified Ideographs Extension C
    {"from": 0x2B740, "to": 0x2B81F},  # CJK Unified Ideographs Extension D
    {"from": 0x2B820, "to": 0x2CEAF},  # CJK Unified Ideographs Extension E (added in Unicode 8.0)
]
# Korean (Hangul): [\uac00-\ud7ff] -- noted for reference, not part of the table above.
def is_cjk(char):
    """Return True if *char* falls inside any interval of the ``ranges`` table.

    CJK stands for Chinese, Japanese, Korean. Background from the original
    notes: Japanese uses a large number of Han characters (more than 20,000
    kanji); Korean has Hangul (more than 50 letters) and more than 20,000
    hanja.

    Args:
        char: a single character; ``ord`` raises TypeError for anything else.

    Returns:
        bool: True when the code point of *char* lies within at least one
        ``{"from": ..., "to": ...}`` entry (bounds inclusive).
    """
    # Compute the code point once instead of once per interval.
    code_point = ord(char)
    # Generator form short-circuits on the first hit, and the loop variable
    # no longer shadows the builtin ``range``.
    return any(span["from"] <= code_point <= span["to"] for span in ranges)
def cjk_substrings(string):
    """Yield every maximal run of consecutive CJK characters in *string*.

    Characters are classified with ``is_cjk``. Runs are yielded in order of
    appearance; non-CJK characters are skipped. An empty string, or one with
    no CJK characters, yields nothing.

    Args:
        string: the text to scan.

    Yields:
        str: each contiguous CJK substring.
    """
    length = len(string)
    i = 0
    while i < length:
        if not is_cjk(string[i]):
            i += 1
            continue
        start = i
        # The bounds check must come first: without it, a CJK run that
        # extends to the end of the string indexes past the last character
        # and raises IndexError.
        while i < length and is_cjk(string[i]):
            i += 1
        yield string[start:i]
def aa():
    """Print the first ten characters of every interval in ``ranges``.

    One line is printed per interval: the interval's index, ``": "``, then
    ten consecutive characters starting at the interval's "from" code point,
    each followed by ``", "``.

    Originally a scratch demo; the intended follow-up was to wrap every
    substring yielded by ``cjk_substrings`` in parentheses and print the
    resulting string.
    """
    for idx, item in enumerate(ranges):
        print(idx, end=": ")
        first_code = item["from"]
        for code in range(first_code, first_code + 10):
            print(chr(code), end=", ")
        print("")
def is_traditional_chinese(text):
    """Return True if OpenCC's Traditional-to-Simplified conversion alters *text*.

    The text is run through an ``opencc.OpenCC('t2s')`` converter; if the
    converted result differs from the input, the input is reported as
    Traditional Chinese.

    Args:
        text: the string to test.

    Returns:
        bool: True when the converted text differs from *text*, else False.

    Raises:
        ImportError: if the ``opencc`` package is not installed.
    """
    # The module never imports ``opencc`` at top level (that import is
    # commented out), so referencing it here used to raise NameError.
    # Importing locally keeps the dependency optional for the other helpers.
    import opencc

    converter = opencc.OpenCC('t2s')
    return converter.convert(text) != text