Mistralアーキテクチャを使った日本語LLM(0.3B)

学習環境

  • A5000 × 7

学習パラメータ

hf_config.json

{
    "model_type": "gpt2",
    "config_name":"gpt2-medium" ,
    "tokenizer_name":"/home/ubuntu/slm/spm_tokenizer_neologdn_bytefallback_nofast" ,
    "train_file":"../wiki.txt",
    "validation_split_percentage":5,
    "output_dir":"checkpoints-mistral-300M-FA2-3",
    "do_train":true,
    "do_eval":true,
    "prediction_loss_only":true,
    "remove_unused_columns":false ,
    "learning_rate":3.0e-4 ,
    "weight_decay":0.1 ,
    "adam_beta2":0.95 ,
    "num_train_epochs":10,
    "logging_dir":"checkpoints-mistral-300M-FA2-3/logs",
    "logging_strategy": "steps" ,
    "logging_steps":10 ,
    "evaluation_strategy":"steps" ,
    "save_strategy": "steps" ,
    "eval_steps":500 ,
    "save_steps":500 ,
    "load_best_model_at_end":true ,
    "save_total_limit":10 ,
    "warmup_steps":4 ,
    "lr_scheduler_type":"cosine" ,
    "per_device_train_batch_size":8,
    "per_device_eval_batch_size":8,
    "block_size":1024 ,
    "adam_epsilon":1.0e-4 ,
    "fp16":true ,
    "gradient_accumulation_steps":16,
    "push_to_hub":false,
    "dataloader_num_workers": 8,
    "optim":"adamw_bnb_8bit" ,
    "torch_compile":true
}

モデルパラメータ

{
  "architectures": [
    "MistralForCausalLM"
  ],
  "bos_token_id": 0,
  "eos_token_id": 0,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 2400,
  "max_position_embeddings": 4096,
  "model_type": "mistral",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 1024,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.35.2",
  "use_cache": true,
  "vocab_size": 50257
}

DeepSpeedのパラメータ

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0.0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 1,
        "min_loss_scale": 0
    },
    
    "bf16": {
        "enabled": "auto"
    },
    

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 0,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e6,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e6,
        "contiguous_gradients": true,
        "round_robin_gradients":true
    },

    "dump_state": true,
    "comms_logger": {
      "enabled": true,
      "verbose": false,
      "prof_all": true,
      "debug": false
    },
    
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto"

 
}

TensorBoardの結果

eval/loss

image/png

train/learning_rate

image/png

train/loss

image/png

モデルの推論コード

from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
import torch

# Local checkpoint directory that holds both the tokenizer and the model weights.
MODEL_NAME = "./pretrain/checkpoints-mistral-300M-FA2-3/checkpoint-12000/"
torch.set_float32_matmul_precision('high')

# Use the GPU when one is available, otherwise fall back to the CPU.
# To force CPU inference, overwrite DEVICE with "cpu" after this block.
if torch.cuda.is_available():
    print("cuda")
    DEVICE = "cuda"
else:
    print("cpu")
    DEVICE = "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
).to(DEVICE)


def generate_text(prompt, max_new_tokens=100):
    """Generate a continuation of *prompt*, print it, and return it.

    The prompt is tokenized without special tokens, passed through
    ``model.generate`` with beam-sampling settings, and the first returned
    sequence is printed twice: once as raw token ids and once decoded.

    Args:
        prompt: Text the model should continue.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the first generated sequence (prompt included).
    """
    inputs = tokenizer(
        prompt,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(model.device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        # NOTE: no TextStreamer is passed because generate() rejects a
        # streamer when beam search (num_beams > 1) is enabled.
        outputs = model.generate(
            inputs["input_ids"],
            # Forward the tokenizer's mask (None if it did not produce one)
            # rather than leaving generate() to infer it.
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=max_new_tokens,
            do_sample=True,
            early_stopping=False,
            top_p=0.95,
            top_k=50,
            temperature=0.9,
            no_repeat_ngram_size=2,
            num_beams=3,
        )

    print(outputs.tolist()[0])
    outputs_txt = tokenizer.decode(outputs[0])
    print(outputs_txt)
    return outputs_txt


# Sample prompts, run one after another with identical generation settings.
for prompt in (
    "大規模言語モデルとは、",
    "まどマギで一番可愛いキャラは、",
):
    outputs_txt = generate_text(prompt)

備考

「ローカルLLMに向き合う会」が主催するLOCAL AI HACKATHONにてリソースをお借りして処理を行いました。

Downloads last month
11
Safetensors
Model size
355M params
Tensor type
F32
·
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.