Spaces:
Runtime error
Runtime error
File size: 1,911 Bytes
50f0fbb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# coding=utf-8
import jieba
def jieba_tokenize(str):
    """Return the token list that ``jieba.lcut`` produces for the given text.

    NOTE: the parameter name shadows the builtin ``str``; it is kept as-is so
    that existing keyword callers (``jieba_tokenize(str=...)``) keep working.
    """
    tokens = jieba.lcut(str)
    return tokens
# Inclusive (first, last) character ranges that is_chinese_char() accepts.
#
# Characters above U+FFFF must be written with the 8-digit ``\UXXXXXXXX``
# escape. ``"\u20000"`` is parsed as the two-character string
# ``"\u2000" + "0"``, which under string comparison silently turns the entry
# into "everything from U+2001 to U+2A6D" and never matches the intended
# supplementary-plane ideographs.
_UCODE_RANGES = (
    ("\u3400", "\u4db5"),  # CJK Unified Ideographs Extension A, release 3.0
    ("\u4e00", "\u9fa5"),  # CJK Unified Ideographs, release 1.1
    ("\u9fa6", "\u9fbb"),  # CJK Unified Ideographs, release 4.1
    ("\uf900", "\ufa2d"),  # CJK Compatibility Ideographs, release 1.1
    ("\ufa30", "\ufa6a"),  # CJK Compatibility Ideographs, release 3.2
    ("\ufa70", "\ufad9"),  # CJK Compatibility Ideographs, release 4.1
    ("\U00020000", "\U0002a6d6"),  # CJK Unified Ideographs Extension B, release 3.1
    ("\U0002f800", "\U0002fa1d"),  # CJK Compatibility Ideographs Supplement, release 3.1
    ("\uff00", "\uffef"),  # Halfwidth and Fullwidth Forms: fullwidth ASCII and
                           # punctuation, halfwidth Katakana, halfwidth Hangul
    ("\u2e80", "\u2eff"),  # CJK Radicals Supplement
    ("\u3000", "\u303f"),  # CJK Symbols and Punctuation
    ("\u31c0", "\u31ef"),  # CJK Strokes
    ("\u2f00", "\u2fdf"),  # Kangxi Radicals
    ("\u2ff0", "\u2fff"),  # Ideographic Description Characters
    ("\u3100", "\u312f"),  # Bopomofo (phonetic symbols)
    ("\u31a0", "\u31bf"),  # Bopomofo Extended (Taiwanese and Hakka)
    ("\ufe10", "\ufe1f"),  # Vertical Forms
    ("\ufe30", "\ufe4f"),  # CJK Compatibility Forms
    ("\u2600", "\u26ff"),  # Miscellaneous Symbols
    ("\u2700", "\u27bf"),  # Dingbats
    ("\u3200", "\u32ff"),  # Enclosed CJK Letters and Months
    ("\u3300", "\u33ff"),  # CJK Compatibility
)


def is_chinese_char(uchar):
    """Return True if ``uchar`` falls inside any range in ``_UCODE_RANGES``.

    Args:
        uchar: The character to classify. It is compared to the range
            endpoints with ordinary string comparison, so it is expected to be
            a single-character string.

    Returns:
        bool: True when ``start <= uchar <= end`` holds for at least one
        ``(start, end)`` pair, False otherwise (including for ``""``).
    """
    return any(start <= uchar <= end for start, end in _UCODE_RANGES)
def chinese_char_tokenize(line):
    """Pad every character accepted by ``is_chinese_char`` with spaces.

    Leading and trailing whitespace is stripped from ``line`` first. Each
    character for which ``is_chinese_char`` returns True is then emitted as
    ``" " + char + " "``; every other character is copied through unchanged.
    The padding is not collapsed afterwards, so two adjacent matching
    characters end up separated by two spaces and the result can begin or end
    with a space.

    Args:
        line: The text to process.

    Returns:
        str: The padded text.
    """
    pieces = []
    for ch in line.strip():
        if not is_chinese_char(ch):
            pieces.append(ch)
            continue
        pieces.append(" " + ch + " ")
    return "".join(pieces)
# s = '中国的首都是哪里?1,2,3d回答'
# print(chinese_char_tokenize(s))
|