# Decode byte-level BPE tokens (GPT-2 style byte<->unicode surrogate alphabet)
# back to readable text, with a small timing benchmark under __main__.
# NOTE(review): replaced non-Python paste residue ("Spaces:" / "Runtime error").
from functools import lru_cache | |
@lru_cache()
def bytes_to_unicode_dict():
    """Return the inverse GPT-2 byte<->unicode table: {unicode_char: byte}.

    The reversible BPE codes work on unicode strings, so every possible byte
    value (0-255) needs a printable one-character unicode representative. We
    specifically avoid mapping to whitespace/control characters the bpe code
    barfs on: printable bytes map to themselves (chr(b)), and the remaining
    bytes are assigned fresh code points starting at 256.

    Note: unlike the original GPT-2 ``bytes_to_unicode`` (which returns
    {byte: unicode_char}), this returns the *inverse* mapping so token
    characters can be looked up directly to recover raw bytes.
    """
    # Bytes that are safe to represent as themselves: '!'..'~', '¡'..'¬', '®'..'ÿ'.
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    shift = 0  # renamed from `n`, which was shadowed by the comprehension below
    for b in range(2 ** 8):
        if b not in bs:
            # Unprintable/control byte: give it the next free code point above 255.
            bs.append(b)
            cs.append(2 ** 8 + shift)
            shift += 1
    # Invert: unicode character -> original byte value.
    return dict(zip((chr(c) for c in cs), bs))
# Module-level lookup table built once at import: BPE unicode char -> raw byte value.
ORD_UNICODE_MAP = bytes_to_unicode_dict()
def byte_to_char(bytestr):
    """Decode a single BPE token string back to readable text.

    Each character of *bytestr* is translated to its original byte via
    ORD_UNICODE_MAP; the resulting byte sequence is then decoded as utf-8,
    with undecodable sequences replaced by U+FFFD.
    """
    raw = bytes(ORD_UNICODE_MAP[ch] for ch in bytestr)
    return raw.decode("utf-8", errors="replace")
# (name typo "unicdode" kept: the __main__ benchmark calls it by this name)
# lru_cache cannot be applied here as-is — a list argument is unhashable.
def bytetokens_to_unicdode(byte_tokens: list):
    """Decode a list of BPE token strings into readable unicode strings."""
    return list(map(byte_to_char, byte_tokens))
if __name__ == '__main__':
    # Sample output of a byte-level BPE tokenizer (mostly Korean news text plus
    # the '<s>'/'</s>' special tokens).  Each entry is spelled in the
    # byte->unicode surrogate alphabet, e.g. 'Ġ' encodes a leading space and
    # 'Ċ' a newline.
    tokens = ['<s>', 'ì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', 'íĶĦëĿ¼ìĿ´', 'ì¦Ī', '(', 'ëĮĢíijľ', 'Ġë°±', 'ìĥģ',
              'ìĹ½', ')', 'ê°Ģ', 'Ġìĺ¬íķ´', 'Ġ8', 'ìĽĶ', 'Ġ기ì¤Ģ', 'Ġëĭ¤ìĪĺ', 'Ġê¶Į', 'ìľĦ',
              'ĠìŀĪëĬĶ', 'Ġê¸Ģë¡ľë²Į', 'ĠíķĻ', 'íļĮìĹIJìĦľ', 'Ġì´Ŀ', 'Ġ16', 'ê±´', 'ìĿĺ', 'ĠìĿ¸ê³µ', 'ì§Ģ',
              'ëĬ¥', '(', 'A', 'I', ')', 'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġëĵ±', 'ìŀ¬', 'íĸĪëĭ¤ê³ł',
              'Ġ9', 'ìĿ¼', 'Ġë°ĿíĺĶ', 'ëĭ¤', '.', 'Ġì§ĢëĤľíķ´', 'Ġëĵ±', 'ìŀ¬', 'íķľ', 'Ġ13',
              'ê±´ë', '³´ëĭ¤', 'Ġ3', 'ê±´', 'Ġë§İìĿĢ', 'Ġëħ¼ë¬¸', 'ìĿ´', 'Ġë°ĺ', 'ëħĦ', 'ìŬ',
              'Ġë§ĮìĹIJ', 'Ġì±Ħ', 'íĥĿ', 'ëIJIJëĭ¤', '.', 'Ġì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', 'íĶĦëĿ¼ìĿ´', 'ì¦Ī',
              '(', 'ìĿ´', 'íķĺ', 'Ġì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', ')', 'ëĬĶ', 'ĠA', 'I',
              'ĠìĹ°êµ¬', 'ĠìĦ±', '과를', 'ĠìĿ´', 'ìĸ´ê°Ģ', '기', 'ĠìľĦíķ´', 'ĠìĿ¸ìŀ¬', 'ĠíĻķë³´', 'ìĹIJ',
              'ĠìĨį', 'ëıĦ를', 'ĠëĨĴìĿ´', 'ê²łëĭ¤ëĬĶ', 'Ġë°©', '침', 'ìĿ´ëĭ¤', '.', 'Ċ', 'Ċ',
              'ì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', 'ëĬĶ', 'Ġ8', 'ìĽĶ', 'ĠìŀIJìĹ°', 'ìĸ´', 'ì²ĺ리', 'Ġë¶Ħìķ¼',
              'ìĿĺ', 'Ġê¸Ģë¡ľë²Į', 'Ġíĥij', 'ĠíķĻ', 'íļĮ', 'ìĿ¸', "Ġ'", 'A', 'C', 'L',
              '-', 'I', 'J', 'C', 'N', 'L', 'P', "'", 'ìĹIJ', 'Ġëħ¼ë¬¸',
              'ìĿĦ', 'Ġë°ľíijľ', 'íķľ', 'ĠìĤ¬ë¡Ģ', 'ê¹Įì§Ģ', 'Ġíķ©', 'íķ´', 'Ġìĺ¬íķ´', 'Ġì´Ŀ', 'Ġ16',
              'ê±´', 'ìĿĺ', 'ĠA', 'I', 'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġëĵ±', 'ìŀ¬', 'íĸĪëĭ¤ê³ł', 'Ġë°ĿíĺĶ',
              'ëĭ¤', '.', 'ĠìĿ´', 'Ġëħ¼ë¬¸', 'ìĿĢ', 'ĠìĿ¸ëıĦ', 'ë©Ķ', 'ìĿ¸', '(', 'in',
              '-', 'd', 'om', 'a', 'in', ')', 'Ġìĥĺ', 'íĶĮ', 'ìĿĦ', 'ĠìĤ¬ìļ©',
              'íķ´', 'ĠìŀIJìĹ°', 'ìĸ´', 'Ġ공격', 'Ġë°©ìĭĿìľ¼ë¡ľ', 'ĠìķĦìĽĥ', 'ìĺ¤', 'ë¸Į', 'ëıĦ', 'ë©Ķ',
              'ìĿ¸', '(', 'out', '-', 'of', '-', 'd', 'om', 'a', 'in',
              ')', 'Ġìĥĺ', 'íĶĮ', 'ìĿĦ', 'ĠìŀIJëıĻ', 'ìľ¼ë¡ľ', 'ĠìĥĿ', 'ìĦ±', ',', 'Ġë¶Ħ',
              'ë¥ĺ', 'Ġ모ëį¸', 'ìĿĺ', 'Ġê°IJ', 'ì§Ģ', 'ĠëĬ¥ëł¥ìĿĦ', 'Ġíĸ¥', 'ìĥģ', 'ìĭľíĤ¤ëĬĶ', 'ĠëĤ´ìļ©',
              'ìĿĺ', 'Ġëħ¼ë¬¸', 'ìĿ´ëĭ¤', '.', 'Ċ', 'Ċ', '7', 'ìĽĶ', 'ìĹIJëĬĶ', 'Ġ머',
              'ìĭł', '룬', 'ëĭĿ', 'ĠíķĻ', 'íļĮ', "Ġ'", 'I', 'C', 'M', 'L',
              "'", 'ìĹIJ', 'Ġíļ¨ìľ¨', 'ìłģìĿ¸', 'Ġê³ł', 'íĴĪ', 'ì§Ī', 'ĠìĿĮ', 'ìĦ±', 'íķ©',
              'ìĦ±ìĿ´', 'Ġê°ĢëĬ¥íķľ', "Ġ'", 'ìĹĶ', 'ëĵľ', 'ĠíĪ¬', 'ĠìĹĶ', 'ëĵľ', '(', 'en',
              'd', '-', 't', 'o', '-', 'en', 'd', ')', "'", 'Ġ모ëį¸',
              'ìĿĦ', 'ĠìłľìķĪ', 'íķĺëĬĶ', 'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġë°ľíijľ', 'íĸĪëĭ¤', '.', 'Ġ6', 'ìĽĶ',
              'ìĹIJëĬĶ', 'ĠìĿĮ', 'íĸ¥', '·', 'ìĿĮ', 'ìĦ±', 'Ġìĭł', 'íĺ¸', 'ì²ĺ리', 'Ġë¶Ħìķ¼',
              'ĠíķĻ', 'ìĪł', 'ëĮĢíļĮ', "Ġ'", 'I', 'C', 'A', 'S', 'S', 'P',
              "'", 'ìĹIJ', 'ĠëĮĢ', 'ê·ľëª¨', 'Ġíħ', 'į', 'ìĬ¤íĬ¸', 'Ġì½Ķ', 'íį¼ìĬ¤', '(',
              'ìĸ¸', 'ìĸ´', 'ĠìĹ°', '구를', 'ĠìľĦíķ´', 'Ġíħ', 'į', 'ìĬ¤íĬ¸ë¥¼', 'Ġì»´íĵ¨íĦ°', 'ê°Ģ',
              'ĠìĿ½ìĿĦ', 'ĠìĪĺ', 'ĠìŀĪëĬĶ', 'Ġíĺķíĥľë¡ľ', 'Ġ모ìķĦ', 'ĠëĨĵìĿĢ', 'Ġìĸ¸ìĸ´', 'ĠìŀIJë£Į', ')', 'Ġìłķë³´',
              'ĠíķĻìĬµ', 'ìĹIJ', 'ĠëĮĢíķľ', 'Ġëħ¼ë¬¸', 'Ġ1', 'ê±´ìĿĦ', 'Ġìĭ¤', 'ìĹĪëĭ¤', '.', 'Ċ',
              '</s>']
    import time

    # Micro-benchmark: decode the same token list 1000 times and report the
    # total wall-clock time for all iterations.
    start = time.time()
    for i in range(1000):
        result = bytetokens_to_unicdode(tokens)
    end = time.time()
    print(result)
    print(f'time: {end-start}')