Heralax committed on
Commit
f6e290f
0 Parent(s):
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.gguf filter=lfs diff=lfs merge=lfs -text
2
+ *.bin filter=lfs diff=lfs merge=lfs -text
Philosophy-Llm-Mistral-Pretrain-7.2B-F16.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7516fb94f1870e169599cff6608e3549b32add432019f63f142c951cf4220010
3
+ size 14484749152
README.md ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: Heralax/philosophy-llm-mistral-pretrain
5
+ tags:
6
+ - generated_from_trainer
7
+ model-index:
8
+ - name: philosophy-hardcore-pretraining
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
16
+ <details><summary>See axolotl config</summary>
17
+
18
+ axolotl version: `0.4.1`
19
+ ```yaml
20
+ # This is an axolotl config that allowed creation of a model knowledgeable about philosophy.
21
+ # Replace the dataset paths under `datasets:` with your own
22
+ # If you want a reference point of what kind of data was fed into this model, check out hawaiitoolkit https://github.com/e-p-armstrong/hawaiitoolkit.git
23
+
24
+ # Rent a GPU with a compute provider like Vast.ai or Runpod
25
+ # (Make sure it is using the axolotl docker image --- winglian/axolotl:main-latest)
26
+ # Copy this file over to the rented instance, in the /workspace/axolotl directory
27
+ # If running on a single-GPU setup, you must run:
28
+ # conda install -c conda-forge mpi4py mpich
29
+ # Then run this command from the /workspace/axolotl directory:
30
+ # accelerate launch --use_deepspeed -m axolotl.cli.train axolotl_config_hawaii_llama3_Jun_9_2024.yaml
31
+
32
+ # If using GaLore, do not use deepspeed
33
+
34
+ # (to copy files over to a rented GPU instance, you'll have to use SSH to Secure CoPy files over from your machine to the rented one. This is what such a command might look like, adapt it to your needs)
35
+ # scp -P 40001 -r ./ root@173.231.62.170:/workspace/axolotl/
36
+
37
+
38
+ # TODO to properly make this great, MAKE VARIED SYSTEM PROMPTS FOR ALL THINGS IN THE hawaii DATASET.
39
+ # And make automated code to produce it so that I built it for this project and not the other one.
40
+ # OK, now I am truly back to working on the efficiency problem.
41
+
42
+ base_model: Heralax/philosophy-llm-mistral-pretrain
43
+ tokenizer_type: AutoTokenizer
44
+ is_mistral_derived_model: true
45
+ load_in_8bit: false
46
+ load_in_4bit: false
47
+ strict: false
48
+
49
+ datasets:
50
+ - path: json
51
+ data_files: philosophy_qa_normal.jsonl
52
+ ds_type: json
53
+ type: sharegpt
54
+ conversation: chatml
55
+ - path: json
56
+ data_files: philosophy_qa_open-ended.jsonl
57
+ ds_type: json
58
+ type: sharegpt
59
+ conversation: chatml
60
+ - path: json
61
+ data_files: philosophy_qa_negative.jsonl
62
+ ds_type: json
63
+ type: sharegpt
64
+ conversation: chatml
65
+
66
+ dataset_prepared_path: last_run_prepared
67
+ output_dir: ./philosophy-hardcore-pretraining
68
+
69
+ sequence_len: 4096
70
+ sample_packing: false
71
+ pad_to_sequence_len: true
72
+ shuffle_merged_datasets: true
73
+
74
+ wandb_project: mistral-philosophy
75
+ wandb_entity:
76
+ wandb_watch:
77
+ wandb_run_id:
78
+ wandb_log_model:
79
+
80
+ gradient_accumulation_steps: 6
81
+ micro_batch_size: 2
82
+ eval_batch_size: 1
83
+ num_epochs: 6
84
+ optimizer: paged_adamw_8bit
85
+ lr_scheduler: cosine
86
+ learning_rate: 0.000020
87
+ weight_decay: 0
88
+ # Gradient clipping max norm
89
+ max_grad_norm: 1.0
90
+ noisy_embedding_alpha: 0
91
+ train_on_inputs: false
92
+ group_by_length: false
93
+ bf16: true
94
+ fp16: false
95
+ tf32: false
96
+
97
+ gradient_checkpointing: unsloth
98
+ early_stopping_patience:
99
+ resume_from_checkpoint:
100
+ logging_steps: 1
101
+ xformers_attention:
102
+ flash_attention: true
103
+
104
+ chat_template: chatml
105
+
106
+ warmup_ratio: 0.5
107
+ auto_resume_from_checkpoints: false
108
+ #warmup_ratio: 0.5
109
+ eval_steps: 10
110
+ saves_per_epoch: 1
111
+ eval_sample_packing: false
112
+ save_total_limit: 3
113
+ debug:
114
+ deepspeed: deepspeed_configs/zero2.json
115
+ special_tokens:
116
+ pad_token: "<|end_of_text|>"
117
+ ```
118
+
119
+ </details><br>
120
+
121
+ # philosophy-hardcore-pretraining
122
+
123
+ This model is a fine-tuned version of [Heralax/philosophy-llm-mistral-pretrain](https://huggingface.co/Heralax/philosophy-llm-mistral-pretrain) on the philosophy QA datasets listed in the axolotl config above.
124
+
125
+ ## Model description
126
+
127
+ More information needed
128
+
129
+ ## Intended uses & limitations
130
+
131
+ More information needed
132
+
133
+ ## Training and evaluation data
134
+
135
+ More information needed
136
+
137
+ ## Training procedure
138
+
139
+ ### Training hyperparameters
140
+
141
+ The following hyperparameters were used during training:
142
+ - learning_rate: 2e-05
143
+ - train_batch_size: 2
144
+ - eval_batch_size: 1
145
+ - seed: 42
146
+ - distributed_type: multi-GPU
147
+ - num_devices: 6
148
+ - gradient_accumulation_steps: 6
149
+ - total_train_batch_size: 72
150
+ - total_eval_batch_size: 6
151
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
152
+ - lr_scheduler_type: cosine
153
+ - lr_scheduler_warmup_steps: 136
154
+ - num_epochs: 6
155
+
156
+ ### Training results
157
+
158
+
159
+
160
+ ### Framework versions
161
+
162
+ - Transformers 4.45.0.dev0
163
+ - Pytorch 2.3.1+cu121
164
+ - Datasets 2.21.0
165
+ - Tokenizers 0.19.1
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<|end_of_text|>": 32000
3
+ }
config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Heralax/philosophy-llm-mistral-pretrain",
3
+ "architectures": [
4
+ "MistralForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 4096,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 14336,
14
+ "max_position_embeddings": 32768,
15
+ "model_type": "mistral",
16
+ "num_attention_heads": 32,
17
+ "num_hidden_layers": 32,
18
+ "num_key_value_heads": 8,
19
+ "rms_norm_eps": 1e-05,
20
+ "rope_theta": 1000000.0,
21
+ "sliding_window": null,
22
+ "tie_word_embeddings": false,
23
+ "torch_dtype": "bfloat16",
24
+ "transformers_version": "4.45.0.dev0",
25
+ "use_cache": false,
26
+ "vocab_size": 32001
27
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "do_sample": true,
5
+ "eos_token_id": 2,
6
+ "transformers_version": "4.45.0.dev0"
7
+ }
ggml-model-Q8_0.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:764aca9ee943237a5d882b17f91db0947fe0d6c1dc20e965b5e657808050f07a
3
+ size 7695867232
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b3e1e7e7f61f0abeafef4f98b2bec3dac4e272402bebd22740ff5be95fafbe5
3
+ size 14483521198
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|end_of_text|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
Binary file (493 kB). View file
 
tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "<|end_of_text|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "<s>",
40
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
41
+ "clean_up_tokenization_spaces": false,
42
+ "eos_token": "</s>",
43
+ "legacy": true,
44
+ "model_max_length": 1000000000000000019884624838656,
45
+ "pad_token": "<|end_of_text|>",
46
+ "sp_model_kwargs": {},
47
+ "spaces_between_special_tokens": false,
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "<unk>",
50
+ "use_default_system_prompt": false
51
+ }