import gc

from datasets import load_dataset
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.processors import TemplateProcessing


def batch_iterator():
    # text
    dataset = (
        load_dataset('saillab/taco-datasets', data_dir=data_dir, split='train')
        for data_dir in [
            'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
            'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
        ]
    )

    for d in dataset:
        for row in d:
            yield row['instruction'] + '\n' + row['input'] + '\n' + row['output']

    del dataset
    gc.collect()

    # text
    dataset = (
        load_dataset('xu-song/cc100-samples', lang, split='train')
        for lang in [
            'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br', 'bs', 'ca', 'cs', 'cy', 'da', 'de',
            'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu',
            'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu', 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka',
            'kk', 'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt', 'lv', 'mg', 'mk', 'ml',
            'mn', 'mr', 'ms', 'my', 'my_zaw', 'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
            'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl', 'so', 'sq', 'sr', 'ss', 'su', 'sv',
            'sw', 'ta', 'ta_rom', 'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur', 'ur_rom', 'uz',
            'vi', 'wo', 'xh', 'yi', 'yo', 'zh-Hans', 'zh-Hant', 'zu',
        ]
    )

    for d in dataset:
        for row in d['text']:
            yield row

    del dataset
    gc.collect()

    # text
    dataset = load_dataset('JeanKaddour/minipile', split='train+validation+test')

    for row in dataset:
        yield row['text']

    del dataset
    gc.collect()

    # code
    dataset = load_dataset('bigcode/programming-languages-keywords', split='train')

    for row in dataset:
        for n in row['keywords']:
            yield n

    del dataset
    gc.collect()

    # code
    dataset = (
        load_dataset('bigcode/the-stack-smol-xs', lang, split='train', trust_remote_code=True)
        for lang in [
            'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly', 'augeas', 'awk', 'batchfile',
            'bison', 'bluespec', 'c', 'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
            'css', 'cuda', 'dart', 'dockerfile', 'elixir', 'elm', 'emacs-lisp', 'erlang', 'f-sharp',
            'fortran', 'glsl', 'go', 'groovy', 'haskell', 'html', 'idris', 'isabelle', 'java',
            'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean', 'literate-agda',
            'literate-coffeescript', 'literate-haskell', 'lua', 'makefile', 'maple', 'markdown',
            'mathematica', 'matlab', 'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
            'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext', 'rmarkdown', 'ruby', 'rust',
            'sas', 'scala', 'scheme', 'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
            'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex', 'thrift', 'typescript',
            'verilog', 'vhdl', 'visual-basic', 'xslt', 'yacc', 'zig',
        ]
    )

    for d in dataset:
        for row in d:
            yield row['content']

    del dataset
    gc.collect()

    # math
    dataset = load_dataset('gair-prox/open-web-math-pro', split='train')

    for row in dataset:
        yield row['text']

    del dataset
    gc.collect()
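
# Optional sanity check (a minimal sketch; PEEK_SAMPLES is a name introduced here
# for illustration only and defaults to 0, so nothing extra runs by default).
# Setting it to a positive number prints the first few raw samples yielded by
# batch_iterator() before committing to a full training run.
PEEK_SAMPLES = 0

if PEEK_SAMPLES:
    from itertools import islice

    for sample in islice(batch_iterator(), PEEK_SAMPLES):
        print(repr(sample[:200]))
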
bpe = BPE(unk_token=None, fuse_unk=False, byte_fallback=False, ignore_merges=True)
tokenizer = Tokenizer(bpe)

special_tokens = [
    '',
    '',
    '',
    '<|im_start|>',
    '<|im_end|>',
    'system',
    'user',
    'assistant',
    'resource',
    'tool',
    'agent',

    # tool/function calling
    '', '', '', '', '', '',
    '"arguments"',
    '"name"',
    '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '',
    '', '', '', '', '', '',

    # misc
    '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '',

    # qa
    '', '', '', '', '', '', '', '',

    # cot, tot, got
    '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '',

    # reasoning
    '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '',

    # reflection
    '', '', '', '', '',
    '', '', '', '', '',
]

for i in range(2, 25):
    special_tokens.append(' ' * i)

for i in range(128 - len(special_tokens)):
    special_tokens.append(f'<|reserved_{i}|>')

# programming languages
dataset = load_dataset('Tanvir1337/programming-languages', split='train')
programming_languages = [n for row in dataset for n in row['text']]
del dataset

# programming languages keywords
dataset = load_dataset('bigcode/programming-languages-keywords', split='train')
code_keywords = [n for row in dataset for n in row['keywords']]
del dataset

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)

tokenizer.post_processor = TemplateProcessing(
    single='$A:0',      # $A represents the sequence; :0 is the type ID for single sequences
    pair='$A:0 $B:1',   # for pairs, type IDs are assigned to both sequences
    special_tokens=[],
)

tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)

trainer = BpeTrainer(
    vocab_size=32768,  # 32 * 1024
    min_frequency=2,
    special_tokens=special_tokens,
    initial_alphabet=programming_languages + code_keywords,
)

tokenizer.train_from_iterator(batch_iterator(), trainer)
tokenizer.save('../tokenizer.json')
tokenizer.model.save('../')

CHATML_CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ '<|im_start|>assistant\n' }}"
    "{% endif %}"
)

fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    chat_template=CHATML_CHAT_TEMPLATE,
    bos_token='',
    eos_token='',
    unk_token='',
    pad_token='',
    clean_up_tokenization_spaces=False,
)

fast_tokenizer.save_pretrained('../')
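
# Optional round-trip sanity check (a minimal sketch; assumes the files above
# were written to '../'). It reloads the saved tokenizer and applies the ChatML
# template to a tiny conversation, confirming that the template and special
# tokens survive save_pretrained(). Uses only standard transformers APIs.
from transformers import AutoTokenizer

reloaded_tokenizer = AutoTokenizer.from_pretrained('../')

example_messages = [
    {'role': 'user', 'content': 'Write a haiku about tokenizers.'},
]

chat_text = reloaded_tokenizer.apply_chat_template(
    example_messages,
    tokenize=False,
    add_generation_prompt=True,
)
token_ids = reloaded_tokenizer(chat_text).input_ids

print(chat_text)
print(f'{len(token_ids)} tokens')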