# coding=utf-8
"""Chinese tokenization helpers and a simple GPU memory reporter."""
import torch


def jieba_tokenize(str):
    """Segment a Chinese string into a list of words using jieba.

    NOTE: the parameter name shadows the builtin ``str``; it is kept
    unchanged for backward compatibility with existing keyword callers.
    """
    # Imported lazily so the rest of this module is usable without jieba.
    import jieba
    return jieba.lcut(str)


# Inclusive (start, end) codepoint ranges treated as "Chinese" characters.
# Supplementary-plane entries must use 8-digit \U escapes: the previous
# "\u20000"-style literals were actually two-character strings
# ("\u2000" + "0"), so CJK Extension B and the Compatibility Supplement
# ranges never matched their intended characters.
_UCODE_RANGES = (
    ("\u3400", "\u4db5"),  # CJK Unified Ideographs Extension A, release 3.0
    ("\u4e00", "\u9fa5"),  # CJK Unified Ideographs, release 1.1
    ("\u9fa6", "\u9fbb"),  # CJK Unified Ideographs, release 4.1
    ("\uf900", "\ufa2d"),  # CJK Compatibility Ideographs, release 1.1
    ("\ufa30", "\ufa6a"),  # CJK Compatibility Ideographs, release 3.2
    ("\ufa70", "\ufad9"),  # CJK Compatibility Ideographs, release 4.1
    ("\U00020000", "\U0002a6d6"),  # CJK Unified Ideographs Extension B, release 3.1
    ("\U0002f800", "\U0002fa1d"),  # CJK Compatibility Supplement, release 3.1
    ("\uff00", "\uffef"),  # Full width ASCII, full width English punctuation,
    # half width Katakana, half wide half width kana, Korean alphabet
    ("\u2e80", "\u2eff"),  # CJK Radicals Supplement
    ("\u3000", "\u303f"),  # CJK punctuation mark
    ("\u31c0", "\u31ef"),  # CJK stroke
    ("\u2f00", "\u2fdf"),  # Kangxi Radicals
    ("\u2ff0", "\u2fff"),  # Chinese character structure
    ("\u3100", "\u312f"),  # Phonetic symbols
    ("\u31a0", "\u31bf"),  # Phonetic symbols (Taiwanese and Hakka expansion)
    ("\ufe10", "\ufe1f"),
    ("\ufe30", "\ufe4f"),
    ("\u2600", "\u26ff"),
    ("\u2700", "\u27bf"),
    ("\u3200", "\u32ff"),
    ("\u3300", "\u33ff"),
)


def is_chinese_char(uchar):
    """Return True if the single character *uchar* lies in any CJK range."""
    return any(start <= uchar <= end for start, end in _UCODE_RANGES)


def chinese_char_tokenize(line):
    """Surround every Chinese character in *line* with single spaces.

    The input is stripped first; non-Chinese characters pass through
    unchanged. Built with a list + join instead of repeated string
    concatenation to avoid quadratic behavior on long lines.
    """
    pieces = []
    for char in line.strip():
        pieces.append(" " + char + " " if is_chinese_char(char) else char)
    return "".join(pieces)


def report_memory(name):
    """Print a simple CUDA memory report prefixed with *name* (values in MB).

    Requires a CUDA-enabled torch runtime (calls ``torch.cuda.*``).
    """
    mega_bytes = 1024.0 * 1024.0
    string = name + ' memory (MB)'
    string += ' | allocated: {}'.format(
        torch.cuda.memory_allocated() / mega_bytes)
    string += ' | max allocated: {}'.format(
        torch.cuda.max_memory_allocated() / mega_bytes)
    string += ' | reserved: {}'.format(
        torch.cuda.memory_reserved() / mega_bytes)
    string += ' | max reserved: {}'.format(
        torch.cuda.max_memory_reserved() / mega_bytes)
    print(string)