8bit-coder committed
Commit: 51d42b1
Parent(s): 6839937
Upload training_files
training_files/alpaca-megaset-fixed.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dd16fa0cb1e2402ab5839ec2231ceacf8062070cd750b50b879e74cb16603d3e
size 30418704
training_files/convert-hf-to-pth-16b.py
ADDED
@@ -0,0 +1,109 @@
# Convert HF to pth
import os
import json

import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

tokenizer = LlamaTokenizer.from_pretrained("./llama-7b-hf")

base_model = LlamaForCausalLM.from_pretrained(
    "output_7b",
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

base_model_sd = base_model.state_dict()

params = {
    "dim": 4096,
    "multiple_of": 256,
    "n_heads": 32,
    "n_layers": 32,
    "norm_eps": 1e-06,
    "vocab_size": -1,
}
n_layers = params["n_layers"]
n_heads = params["n_heads"]
dim = params["dim"]
dims_per_head = dim // n_heads
base = 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))


def permute(w):
    return (
        w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
    )


def unpermute(w):
    return (
        w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim)
    )


# Map HF parameter names back to the original LLaMA checkpoint layout.
def translate_state_dict_key(k):
    k = k.replace("base_model.model.", "")
    if k == "model.embed_tokens.weight":
        return "tok_embeddings.weight"
    elif k == "model.norm.weight":
        return "norm.weight"
    elif k == "lm_head.weight":
        return "output.weight"
    elif k.startswith("model.layers."):
        layer = k.split(".")[2]
        if k.endswith(".self_attn.q_proj.weight"):
            return f"layers.{layer}.attention.wq.weight"
        elif k.endswith(".self_attn.k_proj.weight"):
            return f"layers.{layer}.attention.wk.weight"
        elif k.endswith(".self_attn.v_proj.weight"):
            return f"layers.{layer}.attention.wv.weight"
        elif k.endswith(".self_attn.o_proj.weight"):
            return f"layers.{layer}.attention.wo.weight"
        elif k.endswith(".mlp.gate_proj.weight"):
            return f"layers.{layer}.feed_forward.w1.weight"
        elif k.endswith(".mlp.down_proj.weight"):
            return f"layers.{layer}.feed_forward.w2.weight"
        elif k.endswith(".mlp.up_proj.weight"):
            return f"layers.{layer}.feed_forward.w3.weight"
        elif k.endswith(".input_layernorm.weight"):
            return f"layers.{layer}.attention_norm.weight"
        elif k.endswith(".post_attention_layernorm.weight"):
            return f"layers.{layer}.ffn_norm.weight"
        elif k.endswith("rotary_emb.inv_freq") or "lora" in k:
            return None
        else:
            print(layer, k)
            raise NotImplementedError
    else:
        print(k)
        raise NotImplementedError


new_state_dict = {}
for k, v in base_model_sd.items():
    new_k = translate_state_dict_key(k)
    if new_k is not None:
        if "wq" in new_k or "wk" in new_k:
            # Undo the rotary-embedding permutation applied to q/k projections
            # when the checkpoint was converted to the HF layout.
            new_state_dict[new_k] = unpermute(v)
        else:
            new_state_dict[new_k] = v

torch.save(new_state_dict, "consolidated.00.pth")

with open("params.json", "w") as f:
    json.dump(params, f)

# Resize tensors: keep only the first 32000 rows of the embedding and output
# matrices so the vocab matches the original 32000-token LLaMA vocabulary.
model = torch.load("consolidated.00.pth", map_location=torch.device('cpu'))
x = model["tok_embeddings.weight"]
y = model["output.weight"]
row_exclude = 32000
x = x[:row_exclude]
y = y[:row_exclude]
model["tok_embeddings.weight"] = x
model["output.weight"] = y
torch.save(model, "consolidated.01.pth")
# Delete consolidated.00.pth and rename consolidated.01.pth into consolidated.00.pth
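A quick way to confirm the conversion behaved as expected is to reload the resized checkpoint and compare its tensor shapes against params.json. This is a minimal sanity-check sketch, not part of the uploaded files; it assumes the file names produced by the script above (consolidated.01.pth before the final rename).

import json
import torch

with open("params.json") as f:
    params = json.load(f)

# Load the resized checkpoint written at the end of the script above.
sd = torch.load("consolidated.01.pth", map_location="cpu")

# Every transformer layer should have been renamed into the layers.N.* scheme.
n_layers = params["n_layers"]
assert all(f"layers.{i}.attention.wq.weight" in sd for i in range(n_layers))

# After the resize step, the embedding and output matrices keep 32000 rows.
assert sd["tok_embeddings.weight"].shape == (32000, params["dim"])
assert sd["output.weight"].shape == (32000, params["dim"])
print("checkpoint looks consistent")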
training_files/convert-hf-to-pth-32b.py
ADDED
@@ -0,0 +1,97 @@
# Convert HF to pth
import os
import json

import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

tokenizer = LlamaTokenizer.from_pretrained("./llama-7b-hf")

base_model = LlamaForCausalLM.from_pretrained(
    "output_7b",
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

base_model_sd = base_model.state_dict()

params = {
    "dim": 4096,
    "multiple_of": 256,
    "n_heads": 32,
    "n_layers": 32,
    "norm_eps": 1e-06,
    "vocab_size": -1,
}
n_layers = params["n_layers"]
n_heads = params["n_heads"]
dim = params["dim"]
dims_per_head = dim // n_heads
base = 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))


def permute(w):
    return (
        w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
    )


def unpermute(w):
    return (
        w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim)
    )


def translate_state_dict_key(k):
    k = k.replace("base_model.model.", "")
    if k == "model.embed_tokens.weight":
        return "tok_embeddings.weight"
    elif k == "model.norm.weight":
        return "norm.weight"
    elif k == "lm_head.weight":
        return "output.weight"
    elif k.startswith("model.layers."):
        layer = k.split(".")[2]
        if k.endswith(".self_attn.q_proj.weight"):
            return f"layers.{layer}.attention.wq.weight"
        elif k.endswith(".self_attn.k_proj.weight"):
            return f"layers.{layer}.attention.wk.weight"
        elif k.endswith(".self_attn.v_proj.weight"):
            return f"layers.{layer}.attention.wv.weight"
        elif k.endswith(".self_attn.o_proj.weight"):
            return f"layers.{layer}.attention.wo.weight"
        elif k.endswith(".mlp.gate_proj.weight"):
            return f"layers.{layer}.feed_forward.w1.weight"
        elif k.endswith(".mlp.down_proj.weight"):
            return f"layers.{layer}.feed_forward.w2.weight"
        elif k.endswith(".mlp.up_proj.weight"):
            return f"layers.{layer}.feed_forward.w3.weight"
        elif k.endswith(".input_layernorm.weight"):
            return f"layers.{layer}.attention_norm.weight"
        elif k.endswith(".post_attention_layernorm.weight"):
            return f"layers.{layer}.ffn_norm.weight"
        elif k.endswith("rotary_emb.inv_freq") or "lora" in k:
            return None
        else:
            print(layer, k)
            raise NotImplementedError
    else:
        print(k)
        raise NotImplementedError


new_state_dict = {}
for k, v in base_model_sd.items():
    new_k = translate_state_dict_key(k)
    if new_k is not None:
        if "wq" in new_k or "wk" in new_k:
            new_state_dict[new_k] = unpermute(v)
        else:
            new_state_dict[new_k] = v

torch.save(new_state_dict, "consolidated.00.pth")

with open("params.json", "w") as f:
    json.dump(params, f)
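Both conversion scripts hard-code the 7B geometry (dim 4096, 32 heads, 32 layers) in their params dict. As a hedged alternative, the same values could be read from the model's Hugging Face config so they cannot drift from the checkpoint being converted; this is only a sketch of how one might generalize the scripts, not part of the uploaded files, and it reuses the "output_7b" path from above.

from transformers import AutoConfig

# Read the model geometry from the HF config rather than hard-coding it.
# hidden_size / num_hidden_layers / num_attention_heads / rms_norm_eps are
# standard LlamaConfig fields; multiple_of is not stored there, so it is
# kept from the scripts above.
cfg = AutoConfig.from_pretrained("output_7b")
params = {
    "dim": cfg.hidden_size,
    "multiple_of": 256,
    "n_heads": cfg.num_attention_heads,
    "n_layers": cfg.num_hidden_layers,
    "norm_eps": cfg.rms_norm_eps,
    "vocab_size": -1,
}
print(params)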
training_files/dataset_validator.py
ADDED
@@ -0,0 +1,17 @@
import json

print("This program will validate the JSON training data.")

file = input("Enter the file name with extension: ")

# Load the JSON file
with open(file, "r", encoding="utf8") as f:
    data = json.load(f)

# Check each item in the JSON file
for item in data:
    if "instruction" not in item or "input" not in item or "output" not in item:
        print("Error: Missing key in JSON item.")
        print(item)

print("File done.")
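For reference, a record the validator accepts carries exactly the three keys it checks for. Below is a minimal sketch (the sample contents are made up for illustration) that writes such a file, which could then be fed to dataset_validator.py.

import json

# Hypothetical two-record dataset in the instruction/input/output format that
# dataset_validator.py checks for; "input" may be an empty string.
sample = [
    {
        "instruction": "Summarize the following text.",
        "input": "LLaMA is a family of large language models released by Meta.",
        "output": "LLaMA is a family of large language models from Meta.",
    },
    {"instruction": "Name a prime number below ten.", "input": "", "output": "7"},
]

with open("sample.json", "w", encoding="utf8") as f:
    json.dump(sample, f, ensure_ascii=False, indent=2)
# Running dataset_validator.py on sample.json should print no "Missing key" errors.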
training_files/full-training-instructions.txt
ADDED
@@ -0,0 +1,65 @@
wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh

bash Miniconda3-py310_23.1.0-1-Linux-x86_64.sh

enter, enter, yes, defaults

sudo reboot

conda activate
conda create -n alpaca python=3.10
conda activate alpaca

export PATH="/home/ubuntu/miniconda3/envs/alpaca/bin:$PATH"

sudo apt-get install git-lfs
git lfs install

git clone https://github.com/tatsu-lab/stanford_alpaca

git clone https://huggingface.co/decapoda-research/llama-7b-hf
# remember to edit the tokenizer_config.json from LLaMATokenizer to LlamaTokenizer (a sketch for this edit follows these instructions)

git clone https://huggingface.co/8bit-coder/alpaca-7b-nativeEnhanced

pip install sentencepiece
pip install git+https://github.com/huggingface/transformers.git

cd ./stanford_alpaca

pip install -r requirements.txt

cd ..

torchrun --nproc_per_node=8 --master_port=3045 ./stanford_alpaca/train.py --model_name_or_path ./llama-7b-hf --data_path ./alpaca-7b-nativeEnhanced/training_files/alpaca-megaset-fixed.json --fp16 True --output_dir ./output_7b --num_train_epochs 3 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --gradient_accumulation_steps 16 --evaluation_strategy "no" --save_strategy "steps" --save_steps 200 --learning_rate 2e-5 --weight_decay 0. --warmup_ratio 0.03 --lr_scheduler_type "cosine" --logging_steps 1 --fsdp "full_shard auto_wrap" --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' --tf32 True

# now, make sure with nano that script1.py has proper paths to everything

pip install -q datasets loralib sentencepiece
pip install bitsandbytes

python script1.py

git clone https://github.com/antimatter15/alpaca.cpp

cd alpaca.cpp
mkdir models
cd ..

mv consolidated.01.pth ./alpaca.cpp/models/consolidated.00.pth
mv params.json ./alpaca.cpp/models/params.json
mv output_13b/tokenizer.model ./alpaca.cpp/models/tokenizer.model

cd alpaca.cpp

make

cd ..

python .deez/convert-pth-to-ggml.py ./alpaca.cpp/models 2 (1 for 7b, 2 for 13b, and the rest you can check yourself ;)

cd alpaca.cpp

./quantize models/ggml-model-f16.bin ggml-alpaca-13b-nativeEnhanced-q4.bin 2

there's your finished model!
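Two notes on the steps above. First, the torchrun flags imply an effective batch size of 8 GPUs x 2 sequences per device x 16 gradient-accumulation steps = 256 sequences per optimizer step; if training on a different GPU count, scale --gradient_accumulation_steps to keep that product. Second, here is a small sketch for the tokenizer_config.json edit mentioned after cloning llama-7b-hf; it assumes the file carries a standard "tokenizer_class" field and uses the repo path from these instructions.

import json

# Patch tokenizer_config.json so tokenizer_class reads "LlamaTokenizer"
# instead of the older "LLaMATokenizer" spelling. Path assumes the layout
# used in these instructions.
path = "./llama-7b-hf/tokenizer_config.json"
with open(path, "r", encoding="utf8") as f:
    cfg = json.load(f)

if cfg.get("tokenizer_class") == "LLaMATokenizer":
    cfg["tokenizer_class"] = "LlamaTokenizer"
    with open(path, "w", encoding="utf8") as f:
        json.dump(cfg, f, indent=2)
    print("tokenizer_class updated")
else:
    print("tokenizer_class already set to", cfg.get("tokenizer_class"))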