# NOTE: hosting-page status text captured with this file, not part of the module:
#   "Spaces: Runtime error"
# coding=utf-8 | |
import jieba | |
def jieba_tokenize(str):
    """Segment ``str`` into words with jieba.

    Thin wrapper that forwards the text to ``jieba.lcut`` and hands back
    its result unchanged.

    Args:
        str: Text to segment. (The name shadows the builtin ``str``; it is
            kept as-is so keyword callers are not broken.)

    Returns:
        Whatever ``jieba.lcut`` returns for the given text.
    """
    tokens = jieba.lcut(str)
    return tokens
# Inclusive (first, last) code-point ranges treated as "Chinese" characters.
# Every bound must be a single character so that plain ``str`` comparison in
# ``is_chinese_char`` is a code-point comparison.
#
# NOTE: ``\uXXXX`` consumes exactly four hex digits. Code points above U+FFFF
# must be written with the eight-digit ``\UXXXXXXXX`` form; writing
# ``"\u20000"`` yields the two-character string ``"\u2000" + "0"``.
_UCODE_RANGES = (
    ("\u3400", "\u4db5"),  # CJK Unified Ideographs Extension A, release 3.0
    ("\u4e00", "\u9fa5"),  # CJK Unified Ideographs, release 1.1
    ("\u9fa6", "\u9fbb"),  # CJK Unified Ideographs, release 4.1
    ("\uf900", "\ufa2d"),  # CJK Compatibility Ideographs, release 1.1
    ("\ufa30", "\ufa6a"),  # CJK Compatibility Ideographs, release 3.2
    ("\ufa70", "\ufad9"),  # CJK Compatibility Ideographs, release 4.1
    ("\U00020000", "\U0002a6d6"),  # CJK Unified Ideographs Extension B, release 3.1
    ("\U0002f800", "\U0002fa1d"),  # CJK Compatibility Supplement, release 3.1
    ("\uff00", "\uffef"),  # Full-width ASCII and punctuation, half-width
                           # Katakana/kana, Korean alphabet
    ("\u2e80", "\u2eff"),  # CJK Radicals Supplement
    ("\u3000", "\u303f"),  # CJK punctuation marks
    ("\u31c0", "\u31ef"),  # CJK strokes
    ("\u2f00", "\u2fdf"),  # Kangxi Radicals
    ("\u2ff0", "\u2fff"),  # Chinese character structure (ideographic description)
    ("\u3100", "\u312f"),  # Phonetic symbols (Bopomofo)
    ("\u31a0", "\u31bf"),  # Phonetic symbols (Taiwanese and Hakka expansion)
    ("\ufe10", "\ufe1f"),  # Vertical Forms
    ("\ufe30", "\ufe4f"),  # CJK Compatibility Forms
    ("\u2600", "\u26ff"),  # Miscellaneous Symbols
    ("\u2700", "\u27bf"),  # Dingbats
    ("\u3200", "\u32ff"),  # Enclosed CJK Letters and Months
    ("\u3300", "\u33ff"),  # CJK Compatibility
)


def is_chinese_char(uchar):
    """Return True if ``uchar`` lies inside any range of ``_UCODE_RANGES``.

    Args:
        uchar: A single-character string. The check uses ordinary ``str``
            ordering, so a longer string would be compared
            lexicographically rather than by one code point.

    Returns:
        bool: True when ``start <= uchar <= end`` holds for at least one
        ``(start, end)`` pair in the table, otherwise False.
    """
    return any(start <= uchar <= end for start, end in _UCODE_RANGES)
def chinese_char_tokenize(line):
    """Surround every Chinese character in ``line`` with single spaces.

    Leading and trailing whitespace of the input is stripped first. Each
    character for which ``is_chinese_char`` returns True is then emitted as
    ``" " + char + " "``; every other character is copied through
    unchanged. No whitespace is collapsed afterwards, so two adjacent
    Chinese characters end up separated by two spaces and the result may
    itself begin or end with a space.

    Args:
        line: Text to process.

    Returns:
        str: The space-padded text.
    """
    # Build the pieces lazily and join once rather than growing a string
    # with ``+=`` inside the loop.
    return "".join(
        f" {char} " if is_chinese_char(char) else char
        for char in line.strip()
    )
# Example usage:
# s = '中国的首都是哪里?1,2,3d回答'
# print(chinese_char_tokenize(s))