# HaloMaster's picture
# add fengshen
# 50f0fbb
# raw
# history blame
# 1.91 kB
# coding=utf-8
import jieba
def jieba_tokenize(text):
    """Segment *text* into a list of word tokens with jieba.

    Args:
        text: the input string to segment.

    Returns:
        list[str]: the tokens produced by ``jieba.lcut`` (precise mode).
    """
    # Parameter renamed from ``str``, which shadowed the builtin type.
    return jieba.lcut(text)
_UCODE_RANGES = (
("\u3400", "\u4db5"), # CJK Unified Ideographs Extension A, release 3.0
("\u4e00", "\u9fa5"), # CJK Unified Ideographs, release 1.1
("\u9fa6", "\u9fbb"), # CJK Unified Ideographs, release 4.1
("\uf900", "\ufa2d"), # CJK Compatibility Ideographs, release 1.1
("\ufa30", "\ufa6a"), # CJK Compatibility Ideographs, release 3.2
("\ufa70", "\ufad9"), # CJK Compatibility Ideographs, release 4.1
("\u20000", "\u2a6d6"), # (UTF16) CJK Unified Ideographs Extension B, release 3.1
("\u2f800", "\u2fa1d"), # (UTF16) CJK Compatibility Supplement, release 3.1
("\uff00", "\uffef"), # Full width ASCII, full width of English punctuation,
# half width Katakana, half wide half width kana, Korean alphabet
("\u2e80", "\u2eff"), # CJK Radicals Supplement
("\u3000", "\u303f"), # CJK punctuation mark
("\u31c0", "\u31ef"), # CJK stroke
("\u2f00", "\u2fdf"), # Kangxi Radicals
("\u2ff0", "\u2fff"), # Chinese character structure
("\u3100", "\u312f"), # Phonetic symbols
("\u31a0", "\u31bf"), # Phonetic symbols (Taiwanese and Hakka expansion)
("\ufe10", "\ufe1f"),
("\ufe30", "\ufe4f"),
("\u2600", "\u26ff"),
("\u2700", "\u27bf"),
("\u3200", "\u32ff"),
("\u3300", "\u33ff"),
)
def is_chinese_char(uchar):
for start, end in _UCODE_RANGES:
if start <= uchar <= end:
return True
return False
def chinese_char_tokenize(line):
    """Pad every Chinese character in *line* with a space on each side.

    The input is stripped of leading/trailing whitespace first; non-Chinese
    characters are copied through unchanged. Adjacent Chinese characters end
    up separated by two spaces, matching the original output exactly.

    Args:
        line: the input string.

    Returns:
        str: the space-padded string.
    """
    # Collect pieces and join once: repeated ``+=`` on a str builds a new
    # string each time and is quadratic in the worst case.
    pieces = []
    for char in line.strip():
        if is_chinese_char(char):
            pieces.append(" " + char + " ")
        else:
            pieces.append(char)
    return "".join(pieces)
# s = '中国的首都是哪里?1,2,3d回答'
# print(chinese_char_tokenize(s))