|
from typing import Optional

import regex as re
|
|
|
# Choose the language-identification backend. The project config is optional:
# when it cannot be loaded, or lacks the setting, fall back to "langid".
try:
    from config import config

    LANGUAGE_IDENTIFICATION_LIBRARY = (
        config.webui_config.language_identification_library
    )
except Exception:
    # Deliberately broad so that any failure while loading the config still
    # yields a usable default. Unlike a bare ``except:``, this lets
    # KeyboardInterrupt and SystemExit propagate.
    LANGUAGE_IDENTIFICATION_LIBRARY = "langid"

# Lower-cased backend name; classify_language() branches on this value.
module = LANGUAGE_IDENTIFICATION_LIBRARY.lower()
|
|
|
# Two-letter language codes that classify_language() lets through to
# langid.set_languages(); any other requested code is filtered out.
# Kept in alphabetical order, one row per initial letter.
# fmt: off
langid_languages = [
    "af", "am", "an", "ar", "as", "az",
    "be", "bg", "bn", "br", "bs",
    "ca", "cs", "cy",
    "da", "de", "dz",
    "el", "en", "eo", "es", "et", "eu",
    "fa", "fi", "fo", "fr",
    "ga", "gl", "gu",
    "he", "hi", "hr", "ht", "hu", "hy",
    "id", "is", "it",
    "ja", "jv",
    "ka", "kk", "km", "kn", "ko", "ku", "ky",
    "la", "lb", "lo", "lt", "lv",
    "mg", "mk", "ml", "mn", "mr", "ms", "mt",
    "nb", "ne", "nl", "nn", "no",
    "oc", "or",
    "pa", "pl", "ps", "pt",
    "qu",
    "ro", "ru", "rw",
    "se", "si", "sk", "sl", "sq", "sr", "sv", "sw",
    "ta", "te", "th", "tl", "tr",
    "ug", "uk", "ur",
    "vi", "vo",
    "wa",
    "xh",
    "zh", "zu",
]
# fmt: on
|
|
|
|
|
def classify_language(text: str, target_languages: Optional[list] = None) -> str:
    """Identify the language of ``text`` with the configured backend.

    The backend is selected by the module-level ``module`` value:
    ``"fastlid"`` / ``"fasttext"`` use the ``fastlid`` package and
    ``"langid"`` uses the ``langid`` package. Each backend is imported
    inside its branch, so only the selected one has to be installed.

    Args:
        text: Text to classify.
        target_languages: Optional list of candidate language codes. Codes
            the selected backend does not support are dropped before the
            restriction is applied. ``None`` means "no restriction"; a
            restriction left behind by an earlier call is cleared so the
            result does not depend on call history.

    Returns:
        The first element of the backend's result for ``text``, i.e. the
        detected language code.

    Raises:
        ValueError: If ``module`` is not one of the supported backend names.
    """
    if module in ("fastlid", "fasttext"):
        from fastlid import fastlid, supported_langs

        classifier = fastlid
        if target_languages is not None:
            target_languages = [
                lang for lang in target_languages if lang in supported_langs
            ]
        # fastlid is configured by assigning to this attribute (it is not a
        # method call). Assigning None lifts any earlier restriction.
        fastlid.set_languages = target_languages
    elif module == "langid":
        import langid

        classifier = langid.classify
        if target_languages is not None:
            target_languages = [
                lang for lang in target_languages if lang in langid_languages
            ]
        # langid stores the restriction in module-global state; calling
        # set_languages(None) restores the full language set.
        langid.set_languages(target_languages)
    else:
        raise ValueError(f"Wrong module {module}")

    return classifier(text)[0]
|
|
|
|
|
def classify_zh_ja(text: str) -> str:
    """Tell Japanese from Chinese by looking for kana.

    Args:
        text: Text to inspect.

    Returns:
        ``"ja"`` if ``text`` contains at least one Hiragana or Katakana
        character, otherwise ``"zh"`` (this includes empty text and text
        without any CJK characters).
    """
    # Hiragana (U+3040-U+309F) and Katakana (U+30A0-U+30FF) are adjacent
    # ranges, so one bounds check covers both. Scanning stops at the first
    # kana found; ideographs on their own never decide the result.
    contains_kana = any(0x3040 <= ord(symbol) <= 0x30FF for symbol in text)
    return "ja" if contains_kana else "zh"
|
|
|
|
|
def split_alpha_nonalpha(text, mode=1):
    """Split ``text`` where CJK/kana runs meet Latin-script runs.

    The patterns consist only of lookarounds, so the split points are
    zero-width and no characters are consumed. ``\\p{Latin}`` is a Unicode
    script property, which is why this file binds ``re`` to the ``regex``
    package.

    Args:
        text: String to split.
        mode: Decides which side digits are grouped with.
            ``1`` treats digits like CJK/kana, so a split happens between
            a digit and an adjacent Latin letter.
            ``2`` treats digits like Latin letters, so a split happens
            between a digit and an adjacent CJK/kana character.

    Returns:
        The list of segments produced by ``re.split``.

    Raises:
        ValueError: If ``mode`` is neither 1 nor 2.
    """
    if mode not in (1, 2):
        raise ValueError("Invalid mode. Supported modes are 1 and 2.")

    if mode == 1:
        boundary = r"(?<=[\u4e00-\u9fff\u3040-\u30FF\d])(?=[\p{Latin}])|(?<=[\p{Latin}])(?=[\u4e00-\u9fff\u3040-\u30FF\d])"
    else:
        boundary = r"(?<=[\u4e00-\u9fff\u3040-\u30FF])(?=[\p{Latin}\d])|(?<=[\p{Latin}\d])(?=[\u4e00-\u9fff\u3040-\u30FF])"

    return re.split(boundary, text)
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: classify one Chinese and one Japanese sample with
    # both classifiers, then show both digit-grouping modes of the splitter.
    for sample in ("这是一个测试文本", "これはテストテキストです"):
        print(classify_language(sample))
        print(classify_zh_ja(sample))

    mixed = "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days"
    for split_mode in (1, 2):
        print(split_alpha_nonalpha(mixed, mode=split_mode))
|
|
|
|