8bit-coder committed
Commit: 51d42b1
Parent(s): 6839937
Upload training_files
training_files/alpaca-megaset-fixed.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dd16fa0cb1e2402ab5839ec2231ceacf8062070cd750b50b879e74cb16603d3e
size 30418704
training_files/convert-hf-to-pth-16b.py
ADDED
@@ -0,0 +1,109 @@
# Convert HF to pth
import os
import json

import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

tokenizer = LlamaTokenizer.from_pretrained("./llama-7b-hf")

base_model = LlamaForCausalLM.from_pretrained(
    "output_7b",
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

base_model_sd = base_model.state_dict()

params = {
    "dim": 4096,
    "multiple_of": 256,
    "n_heads": 32,
    "n_layers": 32,
    "norm_eps": 1e-06,
    "vocab_size": -1,
}
n_layers = params["n_layers"]
n_heads = params["n_heads"]
dim = params["dim"]
dims_per_head = dim // n_heads
base = 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))


def permute(w):
    return (
        w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
    )


def unpermute(w):
    return (
        w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim)
    )


# Map HF parameter names back to the original LLaMA checkpoint layout.
def translate_state_dict_key(k):
    k = k.replace("base_model.model.", "")
    if k == "model.embed_tokens.weight":
        return "tok_embeddings.weight"
    elif k == "model.norm.weight":
        return "norm.weight"
    elif k == "lm_head.weight":
        return "output.weight"
    elif k.startswith("model.layers."):
        layer = k.split(".")[2]
        if k.endswith(".self_attn.q_proj.weight"):
            return f"layers.{layer}.attention.wq.weight"
        elif k.endswith(".self_attn.k_proj.weight"):
            return f"layers.{layer}.attention.wk.weight"
        elif k.endswith(".self_attn.v_proj.weight"):
            return f"layers.{layer}.attention.wv.weight"
        elif k.endswith(".self_attn.o_proj.weight"):
            return f"layers.{layer}.attention.wo.weight"
        elif k.endswith(".mlp.gate_proj.weight"):
            return f"layers.{layer}.feed_forward.w1.weight"
        elif k.endswith(".mlp.down_proj.weight"):
            return f"layers.{layer}.feed_forward.w2.weight"
        elif k.endswith(".mlp.up_proj.weight"):
            return f"layers.{layer}.feed_forward.w3.weight"
        elif k.endswith(".input_layernorm.weight"):
            return f"layers.{layer}.attention_norm.weight"
        elif k.endswith(".post_attention_layernorm.weight"):
            return f"layers.{layer}.ffn_norm.weight"
        elif k.endswith("rotary_emb.inv_freq") or "lora" in k:
            return None
        else:
            print(layer, k)
            raise NotImplementedError
    else:
        print(k)
        raise NotImplementedError


new_state_dict = {}
for k, v in base_model_sd.items():
    new_k = translate_state_dict_key(k)
    if new_k is not None:
        if "wq" in new_k or "wk" in new_k:
            # Undo the rotary-embedding permutation applied to q/k projections
            # when the checkpoint was converted to the HF layout.
            new_state_dict[new_k] = unpermute(v)
        else:
            new_state_dict[new_k] = v

torch.save(new_state_dict, "consolidated.00.pth")

with open("params.json", "w") as f:
    json.dump(params, f)

# Resize tensors: keep only the first 32000 rows of the embedding and output
# matrices so the vocab matches the original 32000-token LLaMA vocabulary.
model = torch.load("consolidated.00.pth", map_location=torch.device('cpu'))
x = model["tok_embeddings.weight"]
y = model["output.weight"]
row_exclude = 32000
x = x[:row_exclude]
y = y[:row_exclude]
model["tok_embeddings.weight"] = x
model["output.weight"] = y
torch.save(model, "consolidated.01.pth")
# Delete consolidated.00.pth and rename consolidated.01.pth into consolidated.00.pth
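A quick way to confirm the conversion behaved as expected is to reload the resized checkpoint and compare its tensor shapes against params.json. This is a minimal sanity-check sketch, not part of the uploaded files; it assumes the file names produced by the script above (consolidated.01.pth before the final rename).

import json
import torch

with open("params.json") as f:
    params = json.load(f)

# Load the resized checkpoint written at the end of the script above.
sd = torch.load("consolidated.01.pth", map_location="cpu")

# Every transformer layer should have been renamed into the layers.N.* scheme.
n_layers = params["n_layers"]
assert all(f"layers.{i}.attention.wq.weight" in sd for i in range(n_layers))

# After the resize step, the embedding and output matrices keep 32000 rows.
assert sd["tok_embeddings.weight"].shape == (32000, params["dim"])
assert sd["output.weight"].shape == (32000, params["dim"])
print("checkpoint looks consistent")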
training_files/convert-hf-to-pth-32b.py
ADDED
@@ -0,0 +1,97 @@
# Convert HF to pth
import os
import json

import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

tokenizer = LlamaTokenizer.from_pretrained("./llama-7b-hf")

base_model = LlamaForCausalLM.from_pretrained(
    "output_7b",
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

base_model_sd = base_model.state_dict()

params = {
    "dim": 4096,
    "multiple_of": 256,
    "n_heads": 32,
    "n_layers": 32,
    "norm_eps": 1e-06,
    "vocab_size": -1,
}
n_layers = params["n_layers"]
n_heads = params["n_heads"]
dim = params["dim"]
dims_per_head = dim // n_heads
base = 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))


def permute(w):
    return (
        w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
    )


def unpermute(w):
    return (
        w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim)
    )


def translate_state_dict_key(k):
    k = k.replace("base_model.model.", "")
    if k == "model.embed_tokens.weight":
        return "tok_embeddings.weight"
    elif k == "model.norm.weight":
        return "norm.weight"
    elif k == "lm_head.weight":
        return "output.weight"
    elif k.startswith("model.layers."):
        layer = k.split(".")[2]
        if k.endswith(".self_attn.q_proj.weight"):
            return f"layers.{layer}.attention.wq.weight"
        elif k.endswith(".self_attn.k_proj.weight"):
            return f"layers.{layer}.attention.wk.weight"
        elif k.endswith(".self_attn.v_proj.weight"):
            return f"layers.{layer}.attention.wv.weight"
        elif k.endswith(".self_attn.o_proj.weight"):
            return f"layers.{layer}.attention.wo.weight"
        elif k.endswith(".mlp.gate_proj.weight"):
            return f"layers.{layer}.feed_forward.w1.weight"
        elif k.endswith(".mlp.down_proj.weight"):
            return f"layers.{layer}.feed_forward.w2.weight"
        elif k.endswith(".mlp.up_proj.weight"):
            return f"layers.{layer}.feed_forward.w3.weight"
        elif k.endswith(".input_layernorm.weight"):
            return f"layers.{layer}.attention_norm.weight"
        elif k.endswith(".post_attention_layernorm.weight"):
            return f"layers.{layer}.ffn_norm.weight"
        elif k.endswith("rotary_emb.inv_freq") or "lora" in k:
            return None
        else:
            print(layer, k)
            raise NotImplementedError
    else:
        print(k)
        raise NotImplementedError


new_state_dict = {}
for k, v in base_model_sd.items():
    new_k = translate_state_dict_key(k)
    if new_k is not None:
        if "wq" in new_k or "wk" in new_k:
            new_state_dict[new_k] = unpermute(v)
        else:
            new_state_dict[new_k] = v

torch.save(new_state_dict, "consolidated.00.pth")

with open("params.json", "w") as f:
    json.dump(params, f)
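Both conversion scripts hard-code the 7B geometry (dim 4096, 32 heads, 32 layers) in their params dict. As a hedged alternative, the same values could be read from the model's Hugging Face config so they cannot drift from the checkpoint being converted; this is only a sketch of how one might generalize the scripts, not part of the uploaded files, and it reuses the "output_7b" path from above.

from transformers import AutoConfig

# Read the model geometry from the HF config rather than hard-coding it.
# hidden_size / num_hidden_layers / num_attention_heads / rms_norm_eps are
# standard LlamaConfig fields; multiple_of is not stored there, so it is
# kept from the scripts above.
cfg = AutoConfig.from_pretrained("output_7b")
params = {
    "dim": cfg.hidden_size,
    "multiple_of": 256,
    "n_heads": cfg.num_attention_heads,
    "n_layers": cfg.num_hidden_layers,
    "norm_eps": cfg.rms_norm_eps,
    "vocab_size": -1,
}
print(params)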
training_files/dataset_validator.py
ADDED
@@ -0,0 +1,17 @@
import json

print("This program will validate the JSON training data.")

file = input("Enter the file name with extension: ")

# Load the JSON file
with open(file, "r", encoding="utf8") as f:
    data = json.load(f)

# Check each item in the JSON file
for item in data:
    if "instruction" not in item or "input" not in item or "output" not in item:
        print("Error: Missing key in JSON item.")
        print(item)

print("File done.")
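For reference, a record the validator accepts carries exactly the three keys it checks for. Below is a minimal sketch (the sample contents are made up for illustration) that writes such a file, which could then be fed to dataset_validator.py.

import json

# Hypothetical two-record dataset in the instruction/input/output format that
# dataset_validator.py checks for; "input" may be an empty string.
sample = [
    {
        "instruction": "Summarize the following text.",
        "input": "LLaMA is a family of large language models released by Meta.",
        "output": "LLaMA is a family of large language models from Meta.",
    },
    {"instruction": "Name a prime number below ten.", "input": "", "output": "7"},
]

with open("sample.json", "w", encoding="utf8") as f:
    json.dump(sample, f, ensure_ascii=False, indent=2)
# Running dataset_validator.py on sample.json should print no "Missing key" errors.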
training_files/full-training-instructions.txt
ADDED
@@ -0,0 +1,65 @@
wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh

bash Miniconda3-py310_23.1.0-1-Linux-x86_64.sh

enter, enter, yes, defaults

sudo reboot

conda activate
conda create -n alpaca python=3.10
conda activate alpaca

export PATH="/home/ubuntu/miniconda3/envs/alpaca/bin:$PATH"

sudo apt-get install git-lfs
git lfs install

git clone https://github.com/tatsu-lab/stanford_alpaca

git clone https://huggingface.co/decapoda-research/llama-7b-hf
# remember to edit the tokenizer_config.json from LLaMATokenizer to LlamaTokenizer (a sketch for this edit follows these instructions)

git clone https://huggingface.co/8bit-coder/alpaca-7b-nativeEnhanced

pip install sentencepiece
pip install git+https://github.com/huggingface/transformers.git

cd ./stanford_alpaca

pip install -r requirements.txt

cd ..

torchrun --nproc_per_node=8 --master_port=3045 ./stanford_alpaca/train.py --model_name_or_path ./llama-7b-hf --data_path ./alpaca-7b-nativeEnhanced/training_files/alpaca-megaset-fixed.json --fp16 True --output_dir ./output_7b --num_train_epochs 3 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --gradient_accumulation_steps 16 --evaluation_strategy "no" --save_strategy "steps" --save_steps 200 --learning_rate 2e-5 --weight_decay 0. --warmup_ratio 0.03 --lr_scheduler_type "cosine" --logging_steps 1 --fsdp "full_shard auto_wrap" --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' --tf32 True

# now, make sure with nano that script1.py has proper paths to everything

pip install -q datasets loralib sentencepiece
pip install bitsandbytes

python script1.py

git clone https://github.com/antimatter15/alpaca.cpp

cd alpaca.cpp
mkdir models
cd ..

mv consolidated.01.pth ./alpaca.cpp/models/consolidated.00.pth
mv params.json ./alpaca.cpp/models/params.json
mv output_13b/tokenizer.model ./alpaca.cpp/models/tokenizer.model

cd alpaca.cpp

make

cd ..

python .deez/convert-pth-to-ggml.py ./alpaca.cpp/models 2 (1 for 7b, 2 for 13b, and the rest you can check yourself ;)

cd alpaca.cpp

./quantize models/ggml-model-f16.bin ggml-alpaca-13b-nativeEnhanced-q4.bin 2

there's your finished model!
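Two notes on the steps above. First, the torchrun flags imply an effective batch size of 8 GPUs x 2 sequences per device x 16 gradient-accumulation steps = 256 sequences per optimizer step; if training on a different GPU count, scale --gradient_accumulation_steps to keep that product. Second, here is a small sketch for the tokenizer_config.json edit mentioned after cloning llama-7b-hf; it assumes the file carries a standard "tokenizer_class" field and uses the repo path from these instructions.

import json

# Patch tokenizer_config.json so tokenizer_class reads "LlamaTokenizer"
# instead of the older "LLaMATokenizer" spelling. Path assumes the layout
# used in these instructions.
path = "./llama-7b-hf/tokenizer_config.json"
with open(path, "r", encoding="utf8") as f:
    cfg = json.load(f)

if cfg.get("tokenizer_class") == "LLaMATokenizer":
    cfg["tokenizer_class"] = "LlamaTokenizer"
    with open(path, "w", encoding="utf8") as f:
        json.dump(cfg, f, indent=2)
    print("tokenizer_class updated")
else:
    print("tokenizer_class already set to", cfg.get("tokenizer_class"))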