---
license: apache-2.0
---

# Japanese LLM (0.3B) using the Mistral architecture

# Training environment

* A5000 × 7

# Training parameters

## hf_config.json

```json
{
  "model_type": "gpt2",
  "config_name": "gpt2-medium",
  "tokenizer_name": "/home/ubuntu/slm/spm_tokenizer_neologdn_bytefallback_nofast",
  "train_file": "../wiki.txt",
  "validation_split_percentage": 5,
  "output_dir": "checkpoints-mistral-300M-FA2-3",
  "do_train": true,
  "do_eval": true,
  "prediction_loss_only": true,
  "remove_unused_columns": false,
  "learning_rate": 3.0e-4,
  "weight_decay": 0.1,
  "adam_beta2": 0.95,
  "num_train_epochs": 10,
  "logging_dir": "checkpoints-mistral-300M-FA2-3/logs",
  "logging_strategy": "steps",
  "logging_steps": 10,
  "evaluation_strategy": "steps",
  "save_strategy": "steps",
  "eval_steps": 500,
  "save_steps": 500,
  "load_best_model_at_end": true,
  "save_total_limit": 10,
  "warmup_steps": 4,
  "lr_scheduler_type": "cosine",
  "per_device_train_batch_size": 8,
  "per_device_eval_batch_size": 8,
  "block_size": 1024,
  "adam_epsilon": 1.0e-4,
  "fp16": true,
  "gradient_accumulation_steps": 16,
  "push_to_hub": false,
  "dataloader_num_workers": 8,
  "optim": "adamw_bnb_8bit",
  "torch_compile": true
}
```

With `per_device_train_batch_size` 8, `gradient_accumulation_steps` 16, and 7 GPUs (assuming all of them run data-parallel), the effective global batch is 8 × 16 × 7 = 896 sequences of 1,024 tokens, i.e. roughly 0.9M tokens per optimizer step.

## Model parameters

```json
{
  "architectures": [
    "MistralForCausalLM"
  ],
  "bos_token_id": 0,
  "eos_token_id": 0,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 2400,
  "max_position_embeddings": 4096,
  "model_type": "mistral",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 1024,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.35.2",
  "use_cache": true,
  "vocab_size": 50257
}
```
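As a sanity check on the stated 0.3B size, the sketch below (an illustration added for this card, not part of the original training code; it assumes a `transformers` version with Mistral support, such as the 4.35.2 listed above) rebuilds this configuration with random weights and counts the parameters:

```python
from transformers import MistralConfig, MistralForCausalLM

# Rebuild the architecture from the JSON above; weights are randomly
# initialized here, so this is only useful for counting parameters.
config = MistralConfig(
    vocab_size=50257,
    hidden_size=1024,
    intermediate_size=2400,
    num_hidden_layers=24,
    num_attention_heads=16,
    num_key_value_heads=8,  # grouped-query attention: 8 KV heads for 16 query heads
    max_position_embeddings=4096,
    sliding_window=1024,
    tie_word_embeddings=False,
)
model = MistralForCausalLM(config)

total = sum(p.numel() for p in model.parameters())
embedding = model.get_input_embeddings().weight.numel()
print(f"total parameters: {total / 1e6:.0f}M")                   # ~355M
print(f"non-embedding:    {(total - 2 * embedding) / 1e6:.0f}M")  # ~252M
```

Because `tie_word_embeddings` is false, the input and output embeddings over the 50,257-token vocabulary are separate and account for roughly 100M of the total, leaving about 252M transformer parameters, consistent with the 0.3B label.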
## DeepSpeed parameters

```json
{
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0.0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 1,
    "min_loss_scale": 0
  },
  "bf16": {
    "enabled": "auto"
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupDecayLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto",
      "total_num_steps": "auto"
    }
  },
  "zero_optimization": {
    "stage": 0,
    "allgather_partitions": true,
    "allgather_bucket_size": 2e6,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 2e6,
    "contiguous_gradients": true,
    "round_robin_gradients": true
  },
  "dump_state": true,
  "comms_logger": {
    "enabled": true,
    "verbose": false,
    "prof_all": true,
    "debug": false
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto"
}
```

# TensorBoard results

**eval/loss**

![image/png](https://cdn-uploads.huggingface.co/production/uploads/63ed9019679c2cc40abaf971/5p3B4eQAF552UL7GQsAi4.png)

**train/learning_rate**

![image/png](https://cdn-uploads.huggingface.co/production/uploads/63ed9019679c2cc40abaf971/UAvWXVLsMrjc-J-e9vFIL.png)

**train/loss**

![image/png](https://cdn-uploads.huggingface.co/production/uploads/63ed9019679c2cc40abaf971/OM3Nen3PBEc_PTsXMqbHs.png)

# Model inference code

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
import torch

MODEL_NAME = "./pretrain/checkpoints-mistral-300M-FA2-3/checkpoint-12000/"
torch.set_float32_matmul_precision("high")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
).to(DEVICE)

# streamer = TextStreamer(tokenizer)


def generate_text(prompt: str) -> str:
    """Tokenize a prompt and sample a continuation from the model."""
    inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_new_tokens=100,
            do_sample=True,
            early_stopping=False,
            top_p=0.95,
            top_k=50,
            temperature=0.9,
            # streamer=streamer,
            no_repeat_ngram_size=2,
            num_beams=3,
        )
    print(outputs.tolist()[0])
    return tokenizer.decode(outputs[0])


print(generate_text("大規模言語モデルとは、"))
print(generate_text("まどマギで一番可愛いキャラは、"))
```

# Notes

Training was carried out on compute resources kindly lent to us at the [LOCAL AI HACKATHON](https://prtimes.jp/main/html/rd/p/000000007.000056944.html) organized by ローカルLLMに向き合う会.
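As a lighter-weight alternative to the inference script above, the checkpoint can also be run through the `transformers` text-generation pipeline. This is a minimal sketch added for convenience, not part of the original card; it assumes the same local checkpoint path and reuses the sampling settings from the script above:

```python
from transformers import pipeline

# Same local checkpoint directory as in the inference script above.
CKPT = "./pretrain/checkpoints-mistral-300M-FA2-3/checkpoint-12000/"

generator = pipeline(
    "text-generation",
    model=CKPT,
    use_fast=False,  # load the SentencePiece tokenizer in slow mode, matching the script above
    device=0,        # set to -1 to run on CPU
)

result = generator(
    "大規模言語モデルとは、",
    max_new_tokens=100,
    do_sample=True,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    no_repeat_ngram_size=2,
)
print(result[0]["generated_text"])
```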