Spaces:

yhavinga
/

dutch-tokenizer-arena

Running

App Files Files Community

dutch-tokenizer-arena / examples.py

xu-song

add more tokenizers

f4973d4 11 months ago

raw

history blame

2.37 kB

	# Sample inputs grouped by language code ("en", "zh").
	# Each entry is a three-item list: the sample text followed by two names
	# (presumably the pair of tokenizers to compare on that text; confirm
	# against the caller).
	examples = {
	    "en": [
	        # chatglm has blank_n
	        # The space runs are deliberate: two spaces before "2spaces" and
	        # eight before "8spaces", matching the tab/newline counts that follow.
	        ["spaces:  2spaces        8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm_6b"],
	        # ！？｡＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
	        ["punctuations: ,.:/?+=\"，。！？；【】〔〕〖〗", "baichuan", "llama"],
	        ["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
	        ["digits: (10086 + 98) = 100184", "baichuan", "llama"],
	    ],
	    "zh": [
	        # chatglm has blank_n
	        # Same deliberate runs as the English sample: two spaces, then eight.
	        ["空格测试：  2个空格        8个空格", "llama", "chatglm_6b"],
	        ["标点测试：，。！？；", "baichuan_7b", "llama"],
	        ["符号测试：🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
	        ["数字测试：(10086 + 98) = 100184", "baichuan_7b", "llama"],
	        ["中文简体：宽带，繁体：樂來", "baichuan_7b", "llama"],
	    ],
	}

	# Additional tokenizer pairings, one (tokenizer1, tokenizer2, text) tuple each.
	more_examples = [
	    # BERT family (bert vs clue)
	    ("bert_base_cased", "bert_base_uncased", ""),  # clue vs kplug, bert vs clue

	    # LLaMA family (based on sentencepiece)
	    ("baichuan", "baichuan2", "baichuan2支持多空格，多个换行\n\n\n，do not add dummy prefix as Baichuan1"),
	    ("llama", "baichuan2", "baichuan2支持多空格，多个换行\n\n"),
	    ("llama", "chinese_llama2", ""),
	    ("chinese_llama", "chinese_llama2", ""),

	    # GLM family (based on sentencepiece)
	    ("glm", "chatglm1", ""),
	    ("chatglm1", "chatglm2", ""),

	    # GPT-2 family
	    ("gpt2", "moss", ""),
	    ("", "", ""),  # empty placeholder entry, kept as in the original list

	    # OpenAI family (tiktoken)
	    ("qwen", "gpt_35_turbo", ""),
	]


	def example_fn(example_idx):
	    """Return the entry at ``example_idx`` from the English (``"en"``) examples."""
	    english_examples = examples["en"]
	    return english_examples[example_idx]


	def get_more_example():
	    """Print one comparison URL for every entry in ``more_examples``.

	    Both tokenizer names are inserted into the query string as-is; only the
	    sample text is percent-encoded. The function prints and returns nothing.
	    """
	    from urllib.parse import quote

	    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
	    url_template = "{}?tokenizer1={}&tokenizer2={}&text={}"
	    for left_name, right_name, sample_text in more_examples:
	        encoded_text = quote(sample_text)
	        print(url_template.format(url_prefix, left_name, right_name, encoded_text))


	# Script entry point: when this file is executed directly (not imported),
	# run get_more_example().
	if __name__ == "__main__":
	    get_more_example()