Spaces:
Running
Running
File size: 3,668 Bytes
960cd20 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
import regex as re
try:
from contants import config
except:
pass
# Two-letter language codes used to filter caller-supplied target languages
# before they are handed to the langid backend. Grouped by first letter.
langid_languages = [
    "af", "am", "an", "ar", "as", "az",
    "be", "bg", "bn", "br", "bs",
    "ca", "cs", "cy",
    "da", "de", "dz",
    "el", "en", "eo", "es", "et", "eu",
    "fa", "fi", "fo", "fr",
    "ga", "gl", "gu",
    "he", "hi", "hr", "ht", "hu", "hy",
    "id", "is", "it",
    "ja", "jv",
    "ka", "kk", "km", "kn", "ko", "ku", "ky",
    "la", "lb", "lo", "lt", "lv",
    "mg", "mk", "ml", "mn", "mr", "ms", "mt",
    "nb", "ne", "nl", "nn", "no",
    "oc", "or",
    "pa", "pl", "ps", "pt",
    "qu",
    "ro", "ru", "rw",
    "se", "si", "sk", "sl", "sq", "sr", "sv", "sw",
    "ta", "te", "th", "tl", "tr",
    "ug", "uk", "ur",
    "vi", "vo",
    "wa",
    "xh",
    "zh", "zu",
]
def _supported_subset(target_languages, supported):
    """Return the requested codes that ``supported`` contains, or None if none remain."""
    if not target_languages:
        return None
    kept = [lang for lang in target_languages if lang in supported]
    # An empty restriction is meaningless to the backends; treat it as "no restriction".
    return kept or None


def classify_language(text: str, target_languages: list = None) -> str:
    """Identify the language of ``text`` and return its language code.

    The backend is read from
    ``config.language_identification.language_identification_library``
    (case-insensitive). If that setting cannot be read, ``"langid"`` is used.

    Args:
        text: The text to classify.
        target_languages: Optional list of candidate language codes. Codes the
            selected backend does not support are dropped. ``None``, an empty
            list, or a list without any supported code means "no restriction".

    Returns:
        The first element of the backend's classification result.

    Raises:
        ValueError: If the configured backend is not "fastlid", "fasttext"
            or "langid".
    """
    try:
        module = config.language_identification.language_identification_library.lower()
    except (NameError, AttributeError):
        # `config` is imported on a best-effort basis at module level, so the
        # name may be undefined or lack this setting; fall back to langid.
        module = "langid"

    if module in ("fastlid", "fasttext"):
        from fastlid import fastlid, supported_langs

        classifier = fastlid
        # The restriction lives on the fastlid callable itself, so it is always
        # (re)assigned: otherwise a restriction from an earlier call would leak
        # into a later unrestricted call.
        fastlid.set_languages = _supported_subset(target_languages, supported_langs)
    elif module == "langid":
        import langid

        classifier = langid.classify
        # langid keeps its language set in module-level state; passing None
        # clears any restriction left behind by a previous call.
        langid.set_languages(_supported_subset(target_languages, langid_languages))
    else:
        raise ValueError("Wrong LANGUAGE_IDENTIFICATION_LIBRARY in config.py")

    return classifier(text)[0]
# def classify_zh_ja(text: str) -> str:
# for idx, char in enumerate(text):
# unicode_val = ord(char)
#
# # Detect Japanese kana (hiragana / katakana) characters
# if 0x3040 <= unicode_val <= 0x309F or 0x30A0 <= unicode_val <= 0x30FF:
# return "ja"
#
# # Detect Han (CJK ideograph) characters
# if 0x4E00 <= unicode_val <= 0x9FFF:
# # Check the neighbouring character
# next_char = text[idx + 1] if idx + 1 < len(text) else None
#
# if next_char and (0x3040 <= ord(next_char) <= 0x309F or 0x30A0 <= ord(next_char) <= 0x30FF):
# return "ja"
#
# return "zh"
def split_alpha_nonalpha(text, mode=1):
    """
    Cut ``text`` at the zero-width boundaries between CJK and Latin runs.

    "CJK" here means the ranges U+4E00-U+9FFF and U+3040-U+30FF. The two modes
    differ only in which side digits are grouped with.

    Parameters:
    - text (str): The string to split.
    - mode (int): Which boundary definition to use.
        - 1: digits count with the CJK side. Split where CJK/digit/whitespace
          is followed by a Latin letter, or where a Latin letter/whitespace is
          followed by CJK/digit.
        - 2: digits count with the Latin side. Split where CJK/whitespace is
          followed by a Latin letter/digit, or where a Latin letter/digit/
          whitespace is followed by CJK.

    Returns:
    - list: The pieces of ``text`` between the split points.

    Raises:
    - ValueError: If ``mode`` is neither 1 nor 2.
    """
    if mode not in (1, 2):
        raise ValueError("Invalid mode. Supported modes are 1 and 2.")

    # `\p{Latin}` needs the `regex` package, which this module imports as `re`.
    digits_with_cjk = r'(?<=[\u4e00-\u9fff\u3040-\u30FF\d\s])(?=[\p{Latin}])|(?<=[\p{Latin}\s])(?=[\u4e00-\u9fff\u3040-\u30FF\d])'
    digits_with_latin = r'(?<=[\u4e00-\u9fff\u3040-\u30FF\s])(?=[\p{Latin}\d])|(?<=[\p{Latin}\d\s])(?=[\u4e00-\u9fff\u3040-\u30FF])'

    boundary = digits_with_cjk if mode == 1 else digits_with_latin
    return re.split(boundary, text)
if __name__ == "__main__":
    # Manual smoke run: print the detected language for one Chinese and one
    # Japanese sample sentence.
    samples = (
        "这是一个测试文本",
        "これはテストテキストです",
    )
    for text in samples:
        print(classify_language(text))
|