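# NOTE(review): the six lines below look like repository-viewer residue captured with the file, not Python source.
# kevin-yang
# initial commit
# b1944b2
# raw
# history blame
# 7.96 kB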
from functools import lru_cache
@lru_cache()
def bytes_to_unicode_dict():
    """
    Build the table that maps each byte-level BPE character to its byte value.

    Byte-level BPE represents every one of the 256 possible byte values as a
    single unicode character so that the merge code never has to deal with
    whitespace or control characters. Bytes that are already "safe" printable
    characters ("!".."~", "¡".."¬", "®".."ÿ") stand for themselves; every
    other byte is assigned the next free code point starting at 256.

    Returns:
        dict: 256 entries, ``{character: byte value (int 0..255)}``. Printable
        bytes come first, followed by the remapped bytes in ascending byte
        order. The result is memoised by ``lru_cache``, so every call returns
        the same dict object.
    """
    # Bytes whose own code point is used as their stand-in character.
    printable = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    # Everything else (controls, space, DEL, 0x80-0xA0, soft hyphen), ascending.
    remapped = [value for value in range(256) if value not in printable]

    byte_values = printable + remapped
    # The k-th remapped byte is represented by code point 256 + k.
    code_points = printable + [256 + offset for offset in range(len(remapped))]
    return {chr(point): value for point, value in zip(code_points, byte_values)}
# Module-level lookup table {byte-level BPE character: byte value}, built once at import time.
ORD_UNICODE_MAP = bytes_to_unicode_dict()
@lru_cache()
def byte_to_char(bytestr):
    """
    Decode a single byte-level BPE token into readable text.

    Every character of ``bytestr`` is looked up in ``ORD_UNICODE_MAP`` to
    recover the byte it stands for; the resulting byte sequence is then
    decoded as UTF-8. A token may end in the middle of a multi-byte
    character, so undecodable bytes become U+FFFD instead of raising.

    Args:
        bytestr: token text made of byte-level BPE characters. Must be
            hashable because results are memoised by ``lru_cache``.

    Returns:
        str: the decoded text.

    Raises:
        KeyError: if a character is not a key of ``ORD_UNICODE_MAP``.
    """
    raw = bytes(ORD_UNICODE_MAP[symbol] for symbol in bytestr)
    return raw.decode("utf-8", errors="replace")
# lru_cache is not applied here: a list argument is unhashable and could not serve as a cache key.
def bytetokens_to_unicdode(byte_tokens: list):
    """
    Decode a sequence of byte-level BPE tokens, one output string per token.

    Args:
        byte_tokens: tokens to decode; each one is passed to ``byte_to_char``.

    Returns:
        list: decoded strings, in the same order as ``byte_tokens``.
    """
    return list(map(byte_to_char, byte_tokens))
if __name__ == '__main__':
    # Ad-hoc benchmark: decode a fixed token sequence 1000 times, then print
    # the decoded tokens and the elapsed time.
    import time

    # NOTE(review): a few literals below contain characters that are not keys
    # of the 256-entry byte table (e.g. precomposed Hangul syllables).
    # byte_to_char looks every character up in ORD_UNICODE_MAP, so those
    # tokens would raise KeyError. Presumably an encoding round-trip damaged
    # this fixture - verify it against real tokenizer output before relying
    # on the demo. The literals are intentionally left untouched here.
    tokens = [
        '<s>',
        'ì¹´ì¹´ìĺ¤',
        'ìĹĶ',
        'íĦ°',
        'íĶĦëĿ¼ìĿ´',
        'ì¦Ī',
        '(',
        'ëĮĢíijľ',
        'Ġë°±',
        'ìĥģ',
        'ìĹ½',
        ')',
        'ê°Ģ',
        'Ġìĺ¬íķ´',
        'Ġ8',
        'ìĽĶ',
        'Ġ기ì¤Ģ',
        'Ġëĭ¤ìĪĺ',
        'Ġê¶Į',
        'ìľĦ',
        'ĠìŀĪëĬĶ',
        'Ġê¸Ģë¡ľë²Į',
        'ĠíķĻ',
        'íļĮìĹIJìĦľ',
        'Ġì´Ŀ',
        'Ġ16',
        'ê±´',
        'ìĿĺ',
        'ĠìĿ¸ê³µ',
        'ì§Ģ',
        'ëĬ¥',
        '(',
        'A',
        'I',
        ')',
        'Ġëħ¼ë¬¸',
        'ìĿĦ',
        'Ġëĵ±',
        'ìŀ¬',
        'íĸĪëĭ¤ê³ł',
        'Ġ9',
        'ìĿ¼',
        'Ġë°ĿíĺĶ',
        'ëĭ¤',
        '.',
        'Ġì§ĢëĤľíķ´',
        'Ġëĵ±',
        'ìŀ¬',
        'íķľ',
        'Ġ13',
        'ê±´ë',
        '³´ëĭ¤',
        'Ġ3',
        'ê±´',
        'Ġë§İìĿĢ',
        'Ġëħ¼ë¬¸',
        'ìĿ´',
        'Ġë°ĺ',
        'ëħĦ',
        'ìŬ',
        'Ġë§ĮìĹIJ',
        'Ġì±Ħ',
        'íĥĿ',
        'ëIJIJëĭ¤',
        '.',
        'Ġì¹´ì¹´ìĺ¤',
        'ìĹĶ',
        'íĦ°',
        'íĶĦëĿ¼ìĿ´',
        'ì¦Ī',
        '(',
        'ìĿ´',
        'íķĺ',
        'Ġì¹´ì¹´ìĺ¤',
        'ìĹĶ',
        'íĦ°',
        ')',
        'ëĬĶ',
        'ĠA',
        'I',
        'ĠìĹ°êµ¬',
        'ĠìĦ±',
        '과를',
        'ĠìĿ´',
        'ìĸ´ê°Ģ',
        '기',
        'ĠìľĦíķ´',
        'ĠìĿ¸ìŀ¬',
        'ĠíĻķë³´',
        'ìĹIJ',
        'ĠìĨį',
        'ëıĦ를',
        'ĠëĨĴìĿ´',
        'ê²łëĭ¤ëĬĶ',
        'Ġë°©',
        '침',
        'ìĿ´ëĭ¤',
        '.',
        'Ċ',
        'Ċ',
        'ì¹´ì¹´ìĺ¤',
        'ìĹĶ',
        'íĦ°',
        'ëĬĶ',
        'Ġ8',
        'ìĽĶ',
        'ĠìŀIJìĹ°',
        'ìĸ´',
        'ì²ĺ리',
        'Ġë¶Ħìķ¼',
        'ìĿĺ',
        'Ġê¸Ģë¡ľë²Į',
        'Ġíĥij',
        'ĠíķĻ',
        'íļĮ',
        'ìĿ¸',
        "Ġ'",
        'A',
        'C',
        'L',
        '-',
        'I',
        'J',
        'C',
        'N',
        'L',
        'P',
        "'",
        'ìĹIJ',
        'Ġëħ¼ë¬¸',
        'ìĿĦ',
        'Ġë°ľíijľ',
        'íķľ',
        'ĠìĤ¬ë¡Ģ',
        'ê¹Įì§Ģ',
        'Ġíķ©',
        'íķ´',
        'Ġìĺ¬íķ´',
        'Ġì´Ŀ',
        'Ġ16',
        'ê±´',
        'ìĿĺ',
        'ĠA',
        'I',
        'Ġëħ¼ë¬¸',
        'ìĿĦ',
        'Ġëĵ±',
        'ìŀ¬',
        'íĸĪëĭ¤ê³ł',
        'Ġë°ĿíĺĶ',
        'ëĭ¤',
        '.',
        'ĠìĿ´',
        'Ġëħ¼ë¬¸',
        'ìĿĢ',
        'ĠìĿ¸ëıĦ',
        'ë©Ķ',
        'ìĿ¸',
        '(',
        'in',
        '-',
        'd',
        'om',
        'a',
        'in',
        ')',
        'Ġìĥĺ',
        'íĶĮ',
        'ìĿĦ',
        'ĠìĤ¬ìļ©',
        'íķ´',
        'ĠìŀIJìĹ°',
        'ìĸ´',
        'Ġ공격',
        'Ġë°©ìĭĿìľ¼ë¡ľ',
        'ĠìķĦìĽĥ',
        'ìĺ¤',
        'ë¸Į',
        'ëıĦ',
        'ë©Ķ',
        'ìĿ¸',
        '(',
        'out',
        '-',
        'of',
        '-',
        'd',
        'om',
        'a',
        'in',
        ')',
        'Ġìĥĺ',
        'íĶĮ',
        'ìĿĦ',
        'ĠìŀIJëıĻ',
        'ìľ¼ë¡ľ',
        'ĠìĥĿ',
        'ìĦ±',
        ',',
        'Ġë¶Ħ',
        'ë¥ĺ',
        'Ġ모ëį¸',
        'ìĿĺ',
        'Ġê°IJ',
        'ì§Ģ',
        'ĠëĬ¥ëł¥ìĿĦ',
        'Ġíĸ¥',
        'ìĥģ',
        'ìĭľíĤ¤ëĬĶ',
        'ĠëĤ´ìļ©',
        'ìĿĺ',
        'Ġëħ¼ë¬¸',
        'ìĿ´ëĭ¤',
        '.',
        'Ċ',
        'Ċ',
        '7',
        'ìĽĶ',
        'ìĹIJëĬĶ',
        'Ġ머',
        'ìĭł',
        '룬',
        'ëĭĿ',
        'ĠíķĻ',
        'íļĮ',
        "Ġ'",
        'I',
        'C',
        'M',
        'L',
        "'",
        'ìĹIJ',
        'Ġíļ¨ìľ¨',
        'ìłģìĿ¸',
        'Ġê³ł',
        'íĴĪ',
        'ì§Ī',
        'ĠìĿĮ',
        'ìĦ±',
        'íķ©',
        'ìĦ±ìĿ´',
        'Ġê°ĢëĬ¥íķľ',
        "Ġ'",
        'ìĹĶ',
        'ëĵľ',
        'ĠíĪ¬',
        'ĠìĹĶ',
        'ëĵľ',
        '(',
        'en',
        'd',
        '-',
        't',
        'o',
        '-',
        'en',
        'd',
        ')',
        "'",
        'Ġ모ëį¸',
        'ìĿĦ',
        'ĠìłľìķĪ',
        'íķĺëĬĶ',
        'Ġëħ¼ë¬¸',
        'ìĿĦ',
        'Ġë°ľíijľ',
        'íĸĪëĭ¤',
        '.',
        'Ġ6',
        'ìĽĶ',
        'ìĹIJëĬĶ',
        'ĠìĿĮ',
        'íĸ¥',
        '·',
        'ìĿĮ',
        'ìĦ±',
        'Ġìĭł',
        'íĺ¸',
        'ì²ĺ리',
        'Ġë¶Ħìķ¼',
        'ĠíķĻ',
        'ìĪł',
        'ëĮĢíļĮ',
        "Ġ'",
        'I',
        'C',
        'A',
        'S',
        'S',
        'P',
        "'",
        'ìĹIJ',
        'ĠëĮĢ',
        'ê·ľëª¨',
        'Ġíħ',
        'į',
        'ìĬ¤íĬ¸',
        'Ġì½Ķ',
        'íį¼ìĬ¤',
        '(',
        'ìĸ¸',
        'ìĸ´',
        'ĠìĹ°',
        '구를',
        'ĠìľĦíķ´',
        'Ġíħ',
        'į',
        'ìĬ¤íĬ¸ë¥¼',
        'Ġì»´íĵ¨íĦ°',
        'ê°Ģ',
        'ĠìĿ½ìĿĦ',
        'ĠìĪĺ',
        'ĠìŀĪëĬĶ',
        'Ġíĺķíĥľë¡ľ',
        'Ġ모ìķĦ',
        'ĠëĨĵìĿĢ',
        'Ġìĸ¸ìĸ´',
        'ĠìŀIJë£Į',
        ')',
        'Ġìłķë³´',
        'ĠíķĻìĬµ',
        'ìĹIJ',
        'ĠëĮĢíķľ',
        'Ġëħ¼ë¬¸',
        'Ġ1',
        'ê±´ìĿĦ',
        'Ġìĭ¤',
        'ìĹĪëĭ¤',
        '.',
        'Ċ',
        '</s>',
    ]

    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock, which can jump and is too coarse for short measurements.
    start = time.perf_counter()
    for _ in range(1000):
        result = bytetokens_to_unicdode(tokens)
    elapsed = time.perf_counter() - start

    print(result)
    print(f'time: {elapsed}')