WaveCut committed on
Commit
a6c1b3f
1 Parent(s): d0aa094

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ 3.5bpw/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ 5bpw/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ 6.5bpw/tokenizer.json filter=lfs diff=lfs merge=lfs -text
3.5bpw/.gitattributes ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model-00004-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
37
+ model-00005-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
38
+ model-00001-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
39
+ model-00002-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
40
+ model-00003-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
41
+ orig_adapter/adapter_model.safetensors filter=lfs diff=lfs merge=lfs -text
42
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
3.5bpw/README.md ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - ru
4
+ datasets:
5
+ - IlyaGusev/saiga_scored
6
+ - IlyaGusev/saiga_preferences
7
+ license: apache-2.0
8
+ ---
9
+
10
+
11
+ # Saiga/MistralNemo 12B, Russian fine-tune of Mistral Nemo
12
+
13
+ Based on [an abliterated version](https://huggingface.co/natong19/Mistral-Nemo-Instruct-2407-abliterated) of [Mistral Nemo](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407).
14
+
15
+ Llama.cpp version: TBD
16
+
17
+ Colab: [link](https://colab.research.google.com/drive/1vNzMyPqx2GB7zk3ANDtZEfvhzgYOWu0B)
18
+
19
+ ## Prompt format
20
+
21
+ Original Mistral Nemo prompt format, but with the system prompt at the beginning:
22
+ ```
23
+ <s>Ты — Сайга, русскоязычный автоматический ассистент. Ты разговариваешь с людьми и помогаешь им.
24
+ [INST]Как дела?[/INST]
25
+ Отлично, а у тебя?</s>
26
+ [INST]Шикарно. Как пройти в библиотеку?[/INST]
27
+ ```
28
+
29
+ ## Code example
30
+ ```python
31
+ # Исключительно ознакомительный пример.
32
+ # НЕ НАДО ТАК ИНФЕРИТЬ МОДЕЛЬ В ПРОДЕ.
33
+ # См. https://github.com/vllm-project/vllm или https://github.com/huggingface/text-generation-inference
34
+
35
+ import torch
36
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
37
+
38
+ MODEL_NAME = "IlyaGusev/saiga_nemo_12b"
39
+
40
+ model = AutoModelForCausalLM.from_pretrained(
41
+ MODEL_NAME,
42
+ load_in_8bit=True,
43
+ torch_dtype=torch.bfloat16,
44
+ device_map="auto"
45
+ )
46
+ model.eval()
47
+
48
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
49
+ generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
50
+ print(generation_config)
51
+
52
+ inputs = ["Почему трава зеленая?", "Сочини длинный рассказ, обязательно упоминая следующие объекты. Дано: Таня, мяч"]
53
+ for query in inputs:
54
+ prompt = tokenizer.apply_chat_template([{
55
+ "role": "user",
56
+ "content": query
57
+ }], tokenize=False, add_generation_prompt=True)
58
+ data = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
59
+ data = {k: v.to(model.device) for k, v in data.items()}
60
+ data.pop("token_type_ids", None)
61
+ output_ids = model.generate(**data, generation_config=generation_config)[0]
62
+ output_ids = output_ids[len(data["input_ids"][0]):]
63
+ output = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
64
+ print(query)
65
+ print(output)
66
+ print()
67
+ print("==============================")
68
+ print()
69
+ ```
70
+
71
+ ## Output examples
72
+ ```
73
+ User: Почему трава зеленая?
74
+ Saiga: Трава зеленая по нескольким основным причинам:
75
+ 1. **Хлорофилл**: Основной причиной зеленого цвета травы является наличие хлорофилла, пигмента в клетках растений, который играет ключевую роль в процессе фотосинтеза. Хлорофилл поглощает световые волны синего и красного спектра, а отражение зелёных волн обуславливает именно этот цвет.
76
+ 2. **Процесс фотосинтеза**: Хлорофилл не только придаёт растениям зелёный окрас, но и обеспечивает процесс фотосинтеза, благодаря которому растение превращает солнечный свет, углекислый газ и воду в питательные вещества (в основном глюкозу), кислород и энергию. Этот процесс необходим для жизни растений и их роста.
77
+ 3. **Адаптация к окружающей среде**: Зеленый цвет оптимален для максимального поглощения света в условиях на Земле. Наша атмосфера рассеивает больше синего и красного света, чем зелёного, что делает его наиболее доступным источником света для растений на поверхности планеты.
78
+ 4. **Взаимодействие с окружающими условиями**: При различных условиях (например, недостатке воды или изменении уровня освещённости) трава может менять интенсивность своего зелёного цвета, иногда приобретая желтизну из-за уменьшения содержания хлорофилла или коричневый оттенок из-за образования других пигментов, таких как каротиноиды и антоцианы.
79
+ Таким образом, зеленый цвет травы — это результат эволюционной адаптации растений к земным условиям, определяемый прежде всего присутствием хлорофилла и необходимостью эффективно использовать солнечный свет в процессе фотосинтеза.
80
+ ```
81
+
82
+ ```
83
+ User: Сочини длинный рассказ, обязательно упоминая следующие объекты. Дано: Таня, мяч
84
+ Saiga: ### Сказка о Тане и чудесной встрече
85
+ В далеком пригороде, где улицы были пронизаны зеленью деревьев, а дома казались игрушками в огромной коробке, жила девочка по имени Таня. Её волосы цвета осенней листвы развевались на ветру, а глаза сверкали яркостью летнего неба. И была у неё одна особенность — любовь к мячу. Этот мяч был не просто предметом для игры; он был её верным спутником, другом и даже союзником во всех приключениях.
86
+ Мяч этот, с замысловатым узором из разноцветных полос, словно обладал собственной жизнью. Он мог подсказать, когда пора отправиться на поиски новых друзей или рассказать истории о далеких странах, которые отражались в его шелковистой поверхности.
87
+ Однажды утром, когда первые лучи солнца только начинали пробиваться сквозь листву, Таня решила провести день необычно. Она взяла свой волшебный мяч и направилась в сторону старого парка, что находился на краю города, там, где дороги становились менее знакомыми, а воздух наполнялся загадочными запахами. Мяч пружинил в руках, будто предвкушая что-то особенное.
88
+ Парк встретил её тишиной и полусонными тенистыми аллеями. Таня побежала, играя со своим мячом, который словно летел впереди, маня все дальше. Вдруг, посреди поляны, мяч вырвался из рук, взмыл вверх и... замер, зависнув над землей! Таня остолбенела. Мелькнуло воспоминание о детских сказках, где обыденные вещи вдруг обретали магические свойства.
89
+ — Это сон? — шептала она, делая шаг навстречу парителю.
90
+ В то же мгновение, когда мяч начал медленно снижаться, из-за кустов показалось существо, которое можно было бы принять за маленького человека-животного. Его одежда переливалась всеми цветами радуги, как и узоры на мяче Тани. Существо оказалось Лесным Хранителем — добрым духом парка, хранителем всех тайн и чудес.
91
+ — Здравствуй, Таня, — сказал Хранитель, голос его звучал как мелодичное журчание ручья. — Я ждал тебя. Твой мяч — не просто игрушка, это ключ, открывающий сердца. Ты обладаешь даром видеть мир иначе, и я здесь, чтобы научить тебя этим силам.
92
+ Таня, хоть и испугалась сначала, но любопытство и искреннее желание понять новое быстро преодолели страх. Вместе они начали путешествие по волшебному миру парка. Мяч теперь светился, указывая дорогу, и каждый раз, когда он касался земли, вокруг оживали старые легенды: танцевали светлячки, разговаривали деревья, а птицы пели песни забытых эпох.
93
+ Вскоре Таня узнала, что каждый цвет на её мяче символизировал силу: синий — мудрость, красный — смелость, зелёный — связь с природой, а белый — чистоту намерений. Используя эти силы, она помогала обитателям парка: спасла заблудившихся детей, исцелила больную птицу и даже примирила два враждующих духа-деревьев.
94
+ Когда сумерки опустились на парк, Лесной Хранитель вернулся к своей вековой задаче, но перед этим подарил Танье маленький семечко:
95
+ — Посади его рядом с твоим домом, и пусть растет вместе с тобой. Каждый раз, глядя на него, ты будешь помнить о мире, где всё возможно.
96
+ На следующий день Таня вернулась домой, держа в руках не только любимый мяч, но и новый источник магии. Семя посадили, и вскоре оно превратилось в прекрасное дерево, расцветающее разноцветными цветами — словно продолжение самого мяча. Теперь каждая встреча с ним напоминала о тех приключениях и тайнах, что живут среди нас, если знать, куда и как смотреть.
97
+ И так, благодаря своему верному мячу, Таня стала хранительницей маленьких чудес в своём городе, доказывая всем, что волшебство может быть самым обычным предметом, лишь бы сердце было открыто для приключений и добра.
98
+ ```
99
+
100
+
101
+ ## Versions
102
+ v1:
103
+ - [87a83ce252ff0142cd4cc918fb3e6a9875ca4638](https://huggingface.co/IlyaGusev/saiga_nemo_12b/commit/87a83ce252ff0142cd4cc918fb3e6a9875ca4638)
104
+ - Other name: saiga_nemo_12b_sft_m9_d14_simpo_m19_d31
105
+ - SFT dataset config: [sft_d14.json](https://github.com/IlyaGusev/saiga/blob/main/configs/datasets/sft_d14.json)
106
+ - SFT model config: [saiga_nemo_12b_sft_m9.json](https://github.com/IlyaGusev/saiga/blob/main/configs/models/saiga_nemo_12b_sft_m9.json)
107
+ - SimPO dataset config: [pref_d31.json](https://github.com/IlyaGusev/saiga/blob/main/configs/datasets/pref_d31.json)
108
+ - SimPO model config: [saiga_nemo_12b_simpo_m19.json](https://github.com/IlyaGusev/saiga/blob/main/configs/models/saiga_nemo_12b_simpo_m19.json)
109
+ - SFT wandb: [link](https://wandb.ai/ilyagusev/rulm_self_instruct/runs/e74ozfzh)
110
+ - SimPO wandb: [link](https://wandb.ai/ilyagusev/rulm_self_instruct/runs/b094iiej)
111
+
112
+
113
+ ## Evaluation
114
+
115
+ RuArenaHard:
116
+
117
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/5fc2346dea82dd667bb0ffbc/-uG--3Wu9oUi9_bC_ZFP4.png)
118
+
119
+
120
+ PingPong:
121
+
122
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/5fc2346dea82dd667bb0ffbc/uNmD2YhealySO6UYUH8-g.png)
123
+
3.5bpw/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "models/saiga_nemo_12b_sft_m9_d14",
3
+ "architectures": [
4
+ "MistralForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 5120,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 14336,
14
+ "max_position_embeddings": 8192,
15
+ "model_type": "mistral",
16
+ "num_attention_heads": 32,
17
+ "num_hidden_layers": 40,
18
+ "num_key_value_heads": 8,
19
+ "pad_token_id": 10,
20
+ "rms_norm_eps": 1e-05,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": false,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.45.1",
26
+ "unsloth_version": "2024.8",
27
+ "use_cache": true,
28
+ "vocab_size": 131072,
29
+ "quantization_config": {
30
+ "quant_method": "exl2",
31
+ "version": "0.2.3",
32
+ "bits": 3.5,
33
+ "head_bits": 8,
34
+ "calibration": {
35
+ "rows": 100,
36
+ "length": 2048,
37
+ "dataset": "toxicator-ru"
38
+ }
39
+ }
40
+ }
3.5bpw/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 10,
6
+ "max_length": 1024000,
7
+ "max_new_tokens": 2048,
8
+ "temperature": 0.6,
9
+ "top_p": 0.95,
10
+ "repetition_penalty": 1.1,
11
+ "do_sample": true,
12
+ "transformers_version": "4.45.1"
13
+ }
3.5bpw/model.safetensors.index.json ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 24495564800
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00005-of-00005.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00005.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00005.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00005.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00005.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00005.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00005.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00005.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00005.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00003-of-00005.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00003-of-00005.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00003-of-00005.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00005.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00005.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00005.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00005.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00005.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00005.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00005.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00004-of-00005.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00004-of-00005.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00004-of-00005.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00004-of-00005.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00004-of-00005.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00004-of-00005.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00005.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00004-of-00005.safetensors",
225
+ "model.layers.30.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
226
+ "model.layers.30.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
227
+ "model.layers.30.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
229
+ "model.layers.30.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
230
+ "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
231
+ "model.layers.30.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
232
+ "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00004-of-00005.safetensors",
234
+ "model.layers.31.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
235
+ "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
236
+ "model.layers.31.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
238
+ "model.layers.31.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
239
+ "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
240
+ "model.layers.31.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
241
+ "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
242
+ "model.layers.32.input_layernorm.weight": "model-00004-of-00005.safetensors",
243
+ "model.layers.32.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
244
+ "model.layers.32.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
245
+ "model.layers.32.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
246
+ "model.layers.32.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
247
+ "model.layers.32.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
248
+ "model.layers.32.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
249
+ "model.layers.32.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
250
+ "model.layers.32.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
251
+ "model.layers.33.input_layernorm.weight": "model-00005-of-00005.safetensors",
252
+ "model.layers.33.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
253
+ "model.layers.33.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
254
+ "model.layers.33.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
255
+ "model.layers.33.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
256
+ "model.layers.33.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
257
+ "model.layers.33.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
258
+ "model.layers.33.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
259
+ "model.layers.33.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
260
+ "model.layers.34.input_layernorm.weight": "model-00005-of-00005.safetensors",
261
+ "model.layers.34.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
262
+ "model.layers.34.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
263
+ "model.layers.34.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
264
+ "model.layers.34.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
265
+ "model.layers.34.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
266
+ "model.layers.34.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
267
+ "model.layers.34.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
268
+ "model.layers.34.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
269
+ "model.layers.35.input_layernorm.weight": "model-00005-of-00005.safetensors",
270
+ "model.layers.35.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
271
+ "model.layers.35.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
272
+ "model.layers.35.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
273
+ "model.layers.35.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
274
+ "model.layers.35.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
275
+ "model.layers.35.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
276
+ "model.layers.35.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
277
+ "model.layers.35.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
278
+ "model.layers.36.input_layernorm.weight": "model-00005-of-00005.safetensors",
279
+ "model.layers.36.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
280
+ "model.layers.36.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
281
+ "model.layers.36.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
282
+ "model.layers.36.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
283
+ "model.layers.36.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
284
+ "model.layers.36.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
285
+ "model.layers.36.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
286
+ "model.layers.36.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
287
+ "model.layers.37.input_layernorm.weight": "model-00005-of-00005.safetensors",
288
+ "model.layers.37.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
289
+ "model.layers.37.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
290
+ "model.layers.37.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
291
+ "model.layers.37.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
292
+ "model.layers.37.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
293
+ "model.layers.37.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
294
+ "model.layers.37.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
295
+ "model.layers.37.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
296
+ "model.layers.38.input_layernorm.weight": "model-00005-of-00005.safetensors",
297
+ "model.layers.38.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
298
+ "model.layers.38.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
299
+ "model.layers.38.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
300
+ "model.layers.38.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
301
+ "model.layers.38.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
302
+ "model.layers.38.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
303
+ "model.layers.38.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
304
+ "model.layers.38.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
305
+ "model.layers.39.input_layernorm.weight": "model-00005-of-00005.safetensors",
306
+ "model.layers.39.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
307
+ "model.layers.39.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
308
+ "model.layers.39.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
309
+ "model.layers.39.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
310
+ "model.layers.39.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
311
+ "model.layers.39.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
312
+ "model.layers.39.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
313
+ "model.layers.39.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
314
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00005.safetensors",
315
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
316
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
317
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
318
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
319
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
320
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
321
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
322
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
323
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00005.safetensors",
324
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
325
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
326
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
327
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
328
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
329
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
330
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
331
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
332
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00005.safetensors",
333
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
334
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
335
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
336
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
337
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
338
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
339
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
340
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
341
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00005.safetensors",
342
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
343
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
344
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
345
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
346
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
347
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
348
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
349
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
350
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00005.safetensors",
351
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
352
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
353
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
354
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
355
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
356
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
357
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
358
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
359
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00005.safetensors",
360
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
361
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
362
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
363
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
364
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
365
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
366
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
367
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
368
+ "model.norm.weight": "model-00005-of-00005.safetensors"
369
+ }
370
+ }
3.5bpw/output.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1aaad76054b7e33df35898f47081570ffdddaba70e6028f6588003dc255be5cd
3
+ size 6787831596
3.5bpw/special_tokens_map.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ }
12
+ }
3.5bpw/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0240ce510f08e6c2041724e9043e33be9d251d1e4a4d94eb68cd47b954b61d2
3
+ size 17078292
3.5bpw/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
5bpw/.gitattributes ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model-00004-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
37
+ model-00005-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
38
+ model-00001-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
39
+ model-00002-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
40
+ model-00003-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
41
+ orig_adapter/adapter_model.safetensors filter=lfs diff=lfs merge=lfs -text
42
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
5bpw/README.md ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - ru
4
+ datasets:
5
+ - IlyaGusev/saiga_scored
6
+ - IlyaGusev/saiga_preferences
7
+ license: apache-2.0
8
+ ---
9
+
10
+
11
+ # Saiga/MistralNemo 12B, Russian fine-tune of Mistral Nemo
12
+
13
+ Based on [an abliterated version](https://huggingface.co/natong19/Mistral-Nemo-Instruct-2407-abliterated) of [Mistral Nemo](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407).
14
+
15
+ Llama.cpp version: TBD
16
+
17
+ Colab: [link](https://colab.research.google.com/drive/1vNzMyPqx2GB7zk3ANDtZEfvhzgYOWu0B)
18
+
19
+ ## Prompt format
20
+
21
+ Original Mistral Nemo prompt format, but the system prompt is at the beginning:
22
+ ```
23
+ <s>Ты — Сайга, русскоязычный автоматический ассистент. Ты разговариваешь с людьми и помогаешь им.
24
+ [INST]Как дела?[/INST]
25
+ Отлично, а у тебя?</s>
26
+ [INST]Шикарно. Как пройти в библиотеку?[/INST]
27
+ ```
28
+
29
+ ## Code example
30
+ ```python
31
+ # Исключительно ознакомительный пример.
32
+ # НЕ НАДО ТАК ИНФЕРИТЬ МОДЕЛЬ В ПРОДЕ.
33
+ # См. https://github.com/vllm-project/vllm или https://github.com/huggingface/text-generation-inference
34
+
35
+ import torch
36
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
37
+
38
+ MODEL_NAME = "IlyaGusev/saiga_nemo_12b"
39
+
40
+ model = AutoModelForCausalLM.from_pretrained(
41
+ MODEL_NAME,
42
+ load_in_8bit=True,
43
+ torch_dtype=torch.bfloat16,
44
+ device_map="auto"
45
+ )
46
+ model.eval()
47
+
48
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
49
+ generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
50
+ print(generation_config)
51
+
52
+ inputs = ["Почему трава зеленая?", "Сочини длинный рассказ, обязательно упоминая следующие объекты. Дано: Таня, мяч"]
53
+ for query in inputs:
54
+ prompt = tokenizer.apply_chat_template([{
55
+ "role": "user",
56
+ "content": query
57
+ }], tokenize=False, add_generation_prompt=True)
58
+ data = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
59
+ data = {k: v.to(model.device) for k, v in data.items()}
60
+ data.pop("token_type_ids", None)
61
+ output_ids = model.generate(**data, generation_config=generation_config)[0]
62
+ output_ids = output_ids[len(data["input_ids"][0]):]
63
+ output = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
64
+ print(query)
65
+ print(output)
66
+ print()
67
+ print("==============================")
68
+ print()
69
+ ```
70
+
71
+ ## Output examples
72
+ ```
73
+ User: Почему трава зеленая?
74
+ Saiga: Трава зеленая по нескольким основным причинам:
75
+ 1. **Хлорофилл**: Основной причиной зеленого цвета травы является наличие хлорофилла, пигмента в клетках растений, который играет ключевую роль в процессе фотосинтеза. Хлорофилл поглощает световые волны синего и красного спектра, а отражение зелёных волн обуславливает именно этот цвет.
76
+ 2. **Процесс фотосинтеза**: Хлорофилл не только придаёт растениям зелёный окрас, но и обеспечивает процесс фотосинтеза, благодаря которому растение превращает солнечный свет, углекислый газ и воду в питательные вещества (в основном глюкозу), кислород и энергию. Этот процесс необходим для жизни растений и их роста.
77
+ 3. **Адаптация к окружающей среде**: Зеленый цвет оптимален для максимального поглощения света в условиях на Земле. Наша атмосфера рассеивает больше синего и красного света, чем зелёного, что делает его наиболее доступным источником света для растений на поверхности планеты.
78
+ 4. **Взаимодействие с окружающими условиями**: При различных условиях (например, недостатке воды или изменении уровня освещённости) трава может менять интенсивность своего зелёного цвета, иногда приобретая желтизну из-за уменьшения содержания хлорофилла или коричневый оттенок из-за образования других пигментов, таких как каротиноиды и антоцианы.
79
+ Таким образом, зеленый цвет травы — это результат эволюционной адаптации растений к земным условиям, определяемый прежде всего присутствием хлорофилла и необходимостью эффективно использовать солнечный свет в процессе фотосинтеза.
80
+ ```
81
+
82
+ ```
83
+ User: Сочини длинный рассказ, обязательно упоминая следующие объекты. Дано: Таня, мяч
84
+ Saiga: ### Сказка о Тане и чудесной встрече
85
+ В далеком пригороде, где улицы были пронизаны зеленью деревьев, а дома казались игрушками в огромной коробке, жила девочка по имени Таня. Её волосы цвета осенней листвы развевались на ветру, а глаза сверкали яркостью летнего неба. И была у неё одна особенность — любовь к мячу. Этот мяч был не просто предметом для игры; он был её верным спутником, другом и даже союзником во всех приключениях.
86
+ Мяч этот, с замысловатым узором из разноцветных полос, словно обладал собственной жизнью. Он мог подсказать, когда пора отправиться на поиски новых друзей или рассказать истории о далеких странах, которые отражались в его шелковистой поверхности.
87
+ Однажды утром, когда первые лучи солнца только начинали пробиваться сквозь листву, Таня решила провести день необычно. Она взяла свой волшебный мяч и направилась в сторону старого парка, что находился на краю города, там, где дороги становились менее знакомыми, а воздух наполнялся загадочными запахами. Мяч пружинил в руках, будто предвкушая что-то особенное.
88
+ Парк встретил её тишиной и полусонными тенистыми аллеями. Таня побежала, играя со своим мячом, который словно летел впереди, маня все дальше. Вдруг, посреди поляны, мяч вырвался из рук, взмыл вверх и... замер, зависнув над землей! Таня остолбенела. Мелькнуло воспоминание о детских сказках, где обыденные вещи вдруг обретали магические свойства.
89
+ — Это сон? — шептала она, делая шаг навстречу парителю.
90
+ В то же мгновение, когда мяч начал медленно снижаться, из-за кустов показалось существо, которое можно было бы принять за маленького человека-животного. Его одежда переливалась всеми цветами радуги, как и узоры на мяче Тани. Существо оказалось Лесным Хранителем — добрым духом парка, хранителем всех тайн и чудес.
91
+ — Здравствуй, Таня, — сказал Хранитель, голос его звучал как мелодичное журчание ручья. — Я ждал тебя. Твой мяч — не просто игрушка, это ключ, открывающий сердца. Ты обладаешь даром видеть мир иначе, и я здесь, чтобы научить тебя этим силам.
92
+ Таня, хоть и испугалась сначала, но любопытство и искреннее желание понять новое быстро преодолели страх. Вместе они начали путешествие по волшебному миру парка. Мяч теперь светился, указывая дорогу, и каждый раз, когда он касался земли, вокруг оживали старые легенды: танцевали светлячки, разговаривали деревья, а птицы пели песни забытых эпох.
93
+ Вскоре Таня узнала, что каждый цвет на её мяче символизировал силу: синий — мудрость, красный — смелость, зелёный — связь с природой, а белый — чистоту намерений. Используя эти силы, она помогала обитателям парка: спасла заблудившихся детей, исцелила больную птицу и даже примирила два враждующих духа-деревьев.
94
+ Когда сумерки опустились на парк, Лесной Хранитель вернулся к своей вековой задаче, но перед этим подарил Танье маленький семечко:
95
+ — Посади его рядом с твоим домом, и пусть растет вместе с тобой. Каждый раз, глядя на него, ты будешь помнить о мире, где всё возможно.
96
+ На следующий день Таня вернулась домой, держа в руках не только любимый мяч, но и новый источник магии. Семя посадили, и вскоре оно превратилось в прекрасное дерево, расцветающее разноцветными цветами — словно продолжение самого мяча. Теперь каждая встреча с ним напоминала о тех приключениях и тайнах, что живут среди нас, если знать, куда и как смотреть.
97
+ И так, благодаря своему верному мячу, Таня стала хранительницей маленьких чудес в своём городе, доказывая всем, что волшебство может быть самым обычным предметом, лишь бы сердце было открыто для приключений и добра.
98
+ ```
99
+
100
+
101
+ ## Versions
102
+ v1:
103
+ - [87a83ce252ff0142cd4cc918fb3e6a9875ca4638](https://huggingface.co/IlyaGusev/saiga_nemo_12b/commit/87a83ce252ff0142cd4cc918fb3e6a9875ca4638)
104
+ - Other name: saiga_nemo_12b_sft_m9_d14_simpo_m19_d31
105
+ - SFT dataset config: [sft_d14.json](https://github.com/IlyaGusev/saiga/blob/main/configs/datasets/sft_d14.json)
106
+ - SFT model config: [saiga_nemo_12b_sft_m9.json](https://github.com/IlyaGusev/saiga/blob/main/configs/models/saiga_nemo_12b_sft_m9.json)
107
+ - SimPO dataset config: [pref_d31.json](https://github.com/IlyaGusev/saiga/blob/main/configs/datasets/pref_d31.json)
108
+ - SimPO model config: [saiga_nemo_12b_simpo_m19.json](https://github.com/IlyaGusev/saiga/blob/main/configs/models/saiga_nemo_12b_simpo_m19.json)
109
+ - SFT wandb: [link](https://wandb.ai/ilyagusev/rulm_self_instruct/runs/e74ozfzh)
110
+ - SimPO wandb: [link](https://wandb.ai/ilyagusev/rulm_self_instruct/runs/b094iiej)
111
+
112
+
113
+ ## Evaluation
114
+
115
+ RuArenaHard:
116
+
117
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/5fc2346dea82dd667bb0ffbc/-uG--3Wu9oUi9_bC_ZFP4.png)
118
+
119
+
120
+ PingPong:
121
+
122
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/5fc2346dea82dd667bb0ffbc/uNmD2YhealySO6UYUH8-g.png)
123
+
5bpw/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "models/saiga_nemo_12b_sft_m9_d14",
3
+ "architectures": [
4
+ "MistralForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 5120,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 14336,
14
+ "max_position_embeddings": 8192,
15
+ "model_type": "mistral",
16
+ "num_attention_heads": 32,
17
+ "num_hidden_layers": 40,
18
+ "num_key_value_heads": 8,
19
+ "pad_token_id": 10,
20
+ "rms_norm_eps": 1e-05,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": false,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.45.1",
26
+ "unsloth_version": "2024.8",
27
+ "use_cache": true,
28
+ "vocab_size": 131072,
29
+ "quantization_config": {
30
+ "quant_method": "exl2",
31
+ "version": "0.2.3",
32
+ "bits": 5.0,
33
+ "head_bits": 8,
34
+ "calibration": {
35
+ "rows": 100,
36
+ "length": 2048,
37
+ "dataset": "toxicator-ru"
38
+ }
39
+ }
40
+ }
5bpw/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 10,
6
+ "max_length": 1024000,
7
+ "max_new_tokens": 2048,
8
+ "temperature": 0.6,
9
+ "top_p": 0.95,
10
+ "repetition_penalty": 1.1,
11
+ "do_sample": true,
12
+ "transformers_version": "4.45.1"
13
+ }
5bpw/model.safetensors.index.json ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 24495564800
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00005-of-00005.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00005.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00005.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00005.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00005.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00005.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00005.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00005.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00005.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00003-of-00005.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00003-of-00005.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00003-of-00005.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00005.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00005.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00005.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00005.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00005.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00005.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00005.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00004-of-00005.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00004-of-00005.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00004-of-00005.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00004-of-00005.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00004-of-00005.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00004-of-00005.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00005.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00004-of-00005.safetensors",
225
+ "model.layers.30.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
226
+ "model.layers.30.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
227
+ "model.layers.30.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
229
+ "model.layers.30.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
230
+ "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
231
+ "model.layers.30.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
232
+ "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00004-of-00005.safetensors",
234
+ "model.layers.31.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
235
+ "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
236
+ "model.layers.31.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
238
+ "model.layers.31.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
239
+ "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
240
+ "model.layers.31.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
241
+ "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
242
+ "model.layers.32.input_layernorm.weight": "model-00004-of-00005.safetensors",
243
+ "model.layers.32.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
244
+ "model.layers.32.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
245
+ "model.layers.32.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
246
+ "model.layers.32.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
247
+ "model.layers.32.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
248
+ "model.layers.32.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
249
+ "model.layers.32.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
250
+ "model.layers.32.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
251
+ "model.layers.33.input_layernorm.weight": "model-00005-of-00005.safetensors",
252
+ "model.layers.33.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
253
+ "model.layers.33.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
254
+ "model.layers.33.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
255
+ "model.layers.33.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
256
+ "model.layers.33.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
257
+ "model.layers.33.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
258
+ "model.layers.33.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
259
+ "model.layers.33.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
260
+ "model.layers.34.input_layernorm.weight": "model-00005-of-00005.safetensors",
261
+ "model.layers.34.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
262
+ "model.layers.34.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
263
+ "model.layers.34.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
264
+ "model.layers.34.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
265
+ "model.layers.34.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
266
+ "model.layers.34.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
267
+ "model.layers.34.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
268
+ "model.layers.34.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
269
+ "model.layers.35.input_layernorm.weight": "model-00005-of-00005.safetensors",
270
+ "model.layers.35.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
271
+ "model.layers.35.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
272
+ "model.layers.35.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
273
+ "model.layers.35.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
274
+ "model.layers.35.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
275
+ "model.layers.35.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
276
+ "model.layers.35.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
277
+ "model.layers.35.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
278
+ "model.layers.36.input_layernorm.weight": "model-00005-of-00005.safetensors",
279
+ "model.layers.36.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
280
+ "model.layers.36.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
281
+ "model.layers.36.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
282
+ "model.layers.36.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
283
+ "model.layers.36.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
284
+ "model.layers.36.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
285
+ "model.layers.36.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
286
+ "model.layers.36.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
287
+ "model.layers.37.input_layernorm.weight": "model-00005-of-00005.safetensors",
288
+ "model.layers.37.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
289
+ "model.layers.37.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
290
+ "model.layers.37.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
291
+ "model.layers.37.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
292
+ "model.layers.37.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
293
+ "model.layers.37.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
294
+ "model.layers.37.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
295
+ "model.layers.37.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
296
+ "model.layers.38.input_layernorm.weight": "model-00005-of-00005.safetensors",
297
+ "model.layers.38.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
298
+ "model.layers.38.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
299
+ "model.layers.38.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
300
+ "model.layers.38.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
301
+ "model.layers.38.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
302
+ "model.layers.38.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
303
+ "model.layers.38.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
304
+ "model.layers.38.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
305
+ "model.layers.39.input_layernorm.weight": "model-00005-of-00005.safetensors",
306
+ "model.layers.39.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
307
+ "model.layers.39.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
308
+ "model.layers.39.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
309
+ "model.layers.39.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
310
+ "model.layers.39.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
311
+ "model.layers.39.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
312
+ "model.layers.39.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
313
+ "model.layers.39.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
314
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00005.safetensors",
315
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
316
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
317
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
318
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
319
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
320
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
321
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
322
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
323
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00005.safetensors",
324
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
325
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
326
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
327
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
328
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
329
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
330
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
331
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
332
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00005.safetensors",
333
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
334
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
335
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
336
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
337
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
338
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
339
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
340
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
341
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00005.safetensors",
342
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
343
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
344
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
345
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
346
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
347
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
348
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
349
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
350
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00005.safetensors",
351
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
352
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
353
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
354
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
355
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
356
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
357
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
358
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
359
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00005.safetensors",
360
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
361
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
362
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
363
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
364
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
365
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
366
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
367
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
368
+ "model.norm.weight": "model-00005-of-00005.safetensors"
369
+ }
370
+ }
5bpw/output-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:555d7b41dcf6f3163fc01f52c6ab756e942af1e402b4d3f59237946b7ae3425a
3
+ size 8158571264
5bpw/output-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:759c3eb2b140f413a097a41eec72289a2ac2968b5be27db67dce58e255c4a3b7
3
+ size 671088736
5bpw/special_tokens_map.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ }
12
+ }
5bpw/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0240ce510f08e6c2041724e9043e33be9d251d1e4a4d94eb68cd47b954b61d2
3
+ size 17078292
5bpw/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
6.5bpw/.gitattributes ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model-00004-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
37
+ model-00005-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
38
+ model-00001-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
39
+ model-00002-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
40
+ model-00003-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
41
+ orig_adapter/adapter_model.safetensors filter=lfs diff=lfs merge=lfs -text
42
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
6.5bpw/README.md ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - ru
4
+ datasets:
5
+ - IlyaGusev/saiga_scored
6
+ - IlyaGusev/saiga_preferences
7
+ license: apache-2.0
8
+ ---
9
+
10
+
11
+ # Saiga/MistralNemo 12B, Russian fine-tune of Mistral Nemo
12
+
13
+ Based on [an abliterated version](https://huggingface.co/natong19/Mistral-Nemo-Instruct-2407-abliterated) of [Mistral Nemo](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407).
14
+
15
+ Llama.cpp version: TBD
16
+
17
+ Colab: [link](https://colab.research.google.com/drive/1vNzMyPqx2GB7zk3ANDtZEfvhzgYOWu0B)
18
+
19
+ ## Prompt format
20
+
21
+ Original Mistral Nemo prompt format, but the system prompt is at the beginning:
22
+ ```
23
+ <s>Ты — Сайга, русскоязычный автоматический ассистент. Ты разговариваешь с людьми и помогаешь им.
24
+ [INST]Как дела?[/INST]
25
+ Отлично, а у тебя?</s>
26
+ [INST]Шикарно. Как пройти в библиотеку?[/INST]
27
+ ```
28
+
29
+ ## Code example
30
+ ```python
31
+ # Исключительно ознакомительный пример.
32
+ # НЕ НАДО ТАК ИНФЕРИТЬ МОДЕЛЬ В ПРОДЕ.
33
+ # См. https://github.com/vllm-project/vllm или https://github.com/huggingface/text-generation-inference
34
+
35
+ import torch
36
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
37
+
38
+ MODEL_NAME = "IlyaGusev/saiga_nemo_12b"
39
+
40
+ model = AutoModelForCausalLM.from_pretrained(
41
+ MODEL_NAME,
42
+ load_in_8bit=True,
43
+ torch_dtype=torch.bfloat16,
44
+ device_map="auto"
45
+ )
46
+ model.eval()
47
+
48
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
49
+ generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
50
+ print(generation_config)
51
+
52
+ inputs = ["Почему трава зеленая?", "Сочини длинный рассказ, обязательно упоминая следующие объекты. Дано: Таня, мяч"]
53
+ for query in inputs:
54
+ prompt = tokenizer.apply_chat_template([{
55
+ "role": "user",
56
+ "content": query
57
+ }], tokenize=False, add_generation_prompt=True)
58
+ data = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
59
+ data = {k: v.to(model.device) for k, v in data.items()}
60
+ data.pop("token_type_ids", None)
61
+ output_ids = model.generate(**data, generation_config=generation_config)[0]
62
+ output_ids = output_ids[len(data["input_ids"][0]):]
63
+ output = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
64
+ print(query)
65
+ print(output)
66
+ print()
67
+ print("==============================")
68
+ print()
69
+ ```
70
+
71
+ ## Output examples
72
+ ```
73
+ User: Почему трава зеленая?
74
+ Saiga: Трава зеленая по нескольким основным причинам:
75
+ 1. **Хлорофилл**: Основной причиной зеленого цвета травы является наличие хлорофилла, пигмента в клетках растений, который играет ключевую роль в процессе фотосинтеза. Хлорофилл поглощает световые волны синего и красного спектра, а отражение зелёных волн обуславливает именно этот цвет.
76
+ 2. **Процесс фотосинтеза**: Хлорофилл не только придаёт растениям зелёный окрас, но и обеспечивает процесс фотосинтеза, благодаря которому растение превращает солнечный свет, углекислый газ и воду в питательные вещества (в основном глюкозу), кислород и энергию. Этот процесс необходим для жизни растений и их роста.
77
+ 3. **Адаптация к окружающей среде**: Зеленый цвет оптимален для максимального поглощения света в условиях на Земле. Наша атмосфера рассеивает больше синего и красного света, чем зелёного, что делает его наиболее доступным источником света для растений на поверхности планеты.
78
+ 4. **Взаимодействие с окружающими условиями**: При различных условиях (например, недостатке воды или изменении уровня освещённости) трава может менять интенсивность своего зелёного цвета, иногда приобретая желтизну из-за уменьшения содержания хлорофилла или коричневый оттенок из-за образования других пигментов, таких как каротиноиды и антоцианы.
79
+ Таким образом, зеленый цвет травы — это результат эволюционной адаптации растений к земным условиям, определяемый прежде всего присутствием хлорофилла и необходимостью эффективно использовать солнечный свет в процессе фотосинтеза.
80
+ ```
81
+
82
+ ```
83
+ User: Сочини длинный рассказ, обязательно упоминая следующие объекты. Дано: Таня, мяч
84
+ Saiga: ### Сказка о Тане и чудесной встрече
85
+ В далеком пригороде, где улицы были пронизаны зеленью деревьев, а дома казались игрушками в огромной коробке, жила девочка по имени Таня. Её волосы цвета осенней листвы развевались на ветру, а глаза сверкали яркостью летнего неба. И была у неё одна особенность — любовь к мячу. Этот мяч был не просто предметом для игры; он был её верным спутником, другом и даже союзником во всех приключениях.
86
+ Мяч этот, с замысловатым узором из разноцветных полос, словно обладал собственной жизнью. Он мог подсказать, когда пора отправиться на поиски новых друзей или рассказать истории о далеких странах, которые отражались в его шелковистой поверхности.
87
+ Однажды утром, когда первые лучи солнца только начинали пробиваться сквозь листву, Таня решила провести день необычно. Она взяла свой волшебный мяч и направилась в сторону старого парка, что находился на краю города, там, где дороги становились менее знакомыми, а воздух наполнялся загадочными запахами. Мяч пружинил в руках, будто предвкушая что-то особенное.
88
+ Парк встретил её тишиной и полусонными тенистыми аллеями. Таня побежала, играя со своим мячом, который словно летел впереди, маня все дальше. Вдруг, посреди поляны, мяч вырвался из рук, взмыл вверх и... замер, зависнув над землей! Таня остолбенела. Мелькнуло воспоминание о детских сказках, где обыденные вещи вдруг обретали магические свойства.
89
+ — Это сон? — шептала она, делая шаг навстречу парителю.
90
+ В то же мгновение, когда мяч начал медленно снижаться, из-за кустов показалось существо, которое можно было бы принять за маленького человека-животного. Его одежда переливалась всеми цветами радуги, как и узоры на мяче Тани. Существо оказалось Лесным Хранителем — добрым духом парка, хранителем всех тайн и чудес.
91
+ — Здравствуй, Таня, — сказал Хранитель, голос его звучал как мелодичное журчание ручья. — Я ждал тебя. Твой мяч — не просто игрушка, это ключ, открывающий сердца. Ты обладаешь даром видеть мир иначе, и я здесь, чтобы научить тебя этим силам.
92
+ Таня, хоть и испугалась сначала, но любопытство и искреннее желание понять новое быстро преодолели страх. Вместе они начали путешествие по волшебному миру парка. Мяч теперь светился, указывая дорогу, и каждый раз, когда он касался земли, вокруг оживали старые легенды: танцевали светлячки, разговаривали деревья, а птицы пели песни забытых эпох.
93
+ Вскоре Таня узнала, что каждый цвет на её мяче символизировал силу: синий — мудрость, красный — смелость, зелёный — связь с природой, а белый — чистоту намерений. Используя эти силы, она помогала обитателям парка: спасла заблудившихся детей, исцелила больную птицу и даже примирила два враждующих духа-деревьев.
94
+ Когда сумерки опустились на парк, Лесной Хранитель вернулся к своей вековой задаче, но перед этим подарил Танье маленький семечко:
95
+ — Посади его рядом с твоим домом, и пусть растет вместе с тобой. Каждый раз, глядя на него, ты будешь помнить о мире, где всё возможно.
96
+ На следующий день Таня вернулась домой, держа в руках не только любимый мяч, но и новый источник магии. Семя посадили, и вскоре оно превратилось в прекрасное дерево, расцветающее разноцветными цветами — словно продолжение самого мяча. Теперь каждая встреча с ним напоминала о тех приключениях и тайнах, что живут среди нас, если знать, куда и как смотреть.
97
+ И так, благодаря своему верному мячу, Таня стала хранительницей маленьких чудес в своём городе, доказывая всем, что волшебство может быть самым обычным предметом, лишь бы сердце было открыто для приключений и добра.
98
+ ```
99
+
100
+
101
+ ## Versions
102
+ v1:
103
+ - [87a83ce252ff0142cd4cc918fb3e6a9875ca4638](https://huggingface.co/IlyaGusev/saiga_nemo_12b/commit/87a83ce252ff0142cd4cc918fb3e6a9875ca4638)
104
+ - Other name: saiga_nemo_12b_sft_m9_d14_simpo_m19_d31
105
+ - SFT dataset config: [sft_d14.json](https://github.com/IlyaGusev/saiga/blob/main/configs/datasets/sft_d14.json)
106
+ - SFT model config: [saiga_nemo_12b_sft_m9.json](https://github.com/IlyaGusev/saiga/blob/main/configs/models/saiga_nemo_12b_sft_m9.json)
107
+ - SimPO dataset config: [pref_d31.json](https://github.com/IlyaGusev/saiga/blob/main/configs/datasets/pref_d31.json)
108
+ - SimPO model config: [saiga_nemo_12b_simpo_m19.json](https://github.com/IlyaGusev/saiga/blob/main/configs/models/saiga_nemo_12b_simpo_m19.json)
109
+ - SFT wandb: [link](https://wandb.ai/ilyagusev/rulm_self_instruct/runs/e74ozfzh)
110
+ - SimPO wandb: [link](https://wandb.ai/ilyagusev/rulm_self_instruct/runs/b094iiej)
111
+
112
+
113
+ ## Evaluation
114
+
115
+ RuArenaHard:
116
+
117
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/5fc2346dea82dd667bb0ffbc/-uG--3Wu9oUi9_bC_ZFP4.png)
118
+
119
+
120
+ PingPong:
121
+
122
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/5fc2346dea82dd667bb0ffbc/uNmD2YhealySO6UYUH8-g.png)
123
+
6.5bpw/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "models/saiga_nemo_12b_sft_m9_d14",
3
+ "architectures": [
4
+ "MistralForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 5120,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 14336,
14
+ "max_position_embeddings": 8192,
15
+ "model_type": "mistral",
16
+ "num_attention_heads": 32,
17
+ "num_hidden_layers": 40,
18
+ "num_key_value_heads": 8,
19
+ "pad_token_id": 10,
20
+ "rms_norm_eps": 1e-05,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": false,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.45.1",
26
+ "unsloth_version": "2024.8",
27
+ "use_cache": true,
28
+ "vocab_size": 131072,
29
+ "quantization_config": {
30
+ "quant_method": "exl2",
31
+ "version": "0.2.3",
32
+ "bits": 6.5,
33
+ "head_bits": 8,
34
+ "calibration": {
35
+ "rows": 100,
36
+ "length": 2048,
37
+ "dataset": "toxicator-ru"
38
+ }
39
+ }
40
+ }
6.5bpw/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 10,
6
+ "max_length": 1024000,
7
+ "max_new_tokens": 2048,
8
+ "temperature": 0.6,
9
+ "top_p": 0.95,
10
+ "repetition_penalty": 1.1,
11
+ "do_sample": true,
12
+ "transformers_version": "4.45.1"
13
+ }
6.5bpw/model.safetensors.index.json ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 24495564800
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00005-of-00005.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00005.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00005.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00005.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00005.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00005.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00005.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00005.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00005.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00003-of-00005.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00003-of-00005.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00003-of-00005.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00005.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00005.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00005.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00005.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00005.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00005.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00005.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00004-of-00005.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00004-of-00005.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00004-of-00005.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00004-of-00005.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00004-of-00005.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00004-of-00005.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00005.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00004-of-00005.safetensors",
225
+ "model.layers.30.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
226
+ "model.layers.30.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
227
+ "model.layers.30.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
229
+ "model.layers.30.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
230
+ "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
231
+ "model.layers.30.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
232
+ "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00004-of-00005.safetensors",
234
+ "model.layers.31.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
235
+ "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
236
+ "model.layers.31.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
238
+ "model.layers.31.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
239
+ "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
240
+ "model.layers.31.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
241
+ "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
242
+ "model.layers.32.input_layernorm.weight": "model-00004-of-00005.safetensors",
243
+ "model.layers.32.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
244
+ "model.layers.32.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
245
+ "model.layers.32.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
246
+ "model.layers.32.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
247
+ "model.layers.32.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
248
+ "model.layers.32.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
249
+ "model.layers.32.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
250
+ "model.layers.32.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
251
+ "model.layers.33.input_layernorm.weight": "model-00005-of-00005.safetensors",
252
+ "model.layers.33.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
253
+ "model.layers.33.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
254
+ "model.layers.33.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
255
+ "model.layers.33.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
256
+ "model.layers.33.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
257
+ "model.layers.33.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
258
+ "model.layers.33.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
259
+ "model.layers.33.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
260
+ "model.layers.34.input_layernorm.weight": "model-00005-of-00005.safetensors",
261
+ "model.layers.34.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
262
+ "model.layers.34.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
263
+ "model.layers.34.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
264
+ "model.layers.34.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
265
+ "model.layers.34.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
266
+ "model.layers.34.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
267
+ "model.layers.34.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
268
+ "model.layers.34.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
269
+ "model.layers.35.input_layernorm.weight": "model-00005-of-00005.safetensors",
270
+ "model.layers.35.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
271
+ "model.layers.35.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
272
+ "model.layers.35.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
273
+ "model.layers.35.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
274
+ "model.layers.35.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
275
+ "model.layers.35.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
276
+ "model.layers.35.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
277
+ "model.layers.35.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
278
+ "model.layers.36.input_layernorm.weight": "model-00005-of-00005.safetensors",
279
+ "model.layers.36.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
280
+ "model.layers.36.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
281
+ "model.layers.36.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
282
+ "model.layers.36.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
283
+ "model.layers.36.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
284
+ "model.layers.36.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
285
+ "model.layers.36.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
286
+ "model.layers.36.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
287
+ "model.layers.37.input_layernorm.weight": "model-00005-of-00005.safetensors",
288
+ "model.layers.37.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
289
+ "model.layers.37.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
290
+ "model.layers.37.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
291
+ "model.layers.37.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
292
+ "model.layers.37.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
293
+ "model.layers.37.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
294
+ "model.layers.37.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
295
+ "model.layers.37.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
296
+ "model.layers.38.input_layernorm.weight": "model-00005-of-00005.safetensors",
297
+ "model.layers.38.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
298
+ "model.layers.38.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
299
+ "model.layers.38.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
300
+ "model.layers.38.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
301
+ "model.layers.38.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
302
+ "model.layers.38.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
303
+ "model.layers.38.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
304
+ "model.layers.38.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
305
+ "model.layers.39.input_layernorm.weight": "model-00005-of-00005.safetensors",
306
+ "model.layers.39.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
307
+ "model.layers.39.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
308
+ "model.layers.39.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
309
+ "model.layers.39.post_attention_layernorm.weight": "model-00005-of-00005.safetensors",
310
+ "model.layers.39.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
311
+ "model.layers.39.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
312
+ "model.layers.39.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
313
+ "model.layers.39.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
314
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00005.safetensors",
315
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
316
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
317
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
318
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
319
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
320
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
321
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
322
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
323
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00005.safetensors",
324
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
325
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
326
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
327
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
328
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
329
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
330
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
331
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
332
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00005.safetensors",
333
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
334
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
335
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
336
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
337
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
338
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
339
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
340
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
341
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00005.safetensors",
342
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
343
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
344
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
345
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
346
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
347
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
348
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
349
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
350
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00005.safetensors",
351
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
352
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
353
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
354
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
355
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
356
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
357
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
358
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
359
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00005.safetensors",
360
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
361
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
362
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
363
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
364
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
365
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
366
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
367
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
368
+ "model.norm.weight": "model-00005-of-00005.safetensors"
369
+ }
370
+ }
6.5bpw/output-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88f54507f5b8fb292fb9fa25ceedd3778c16e88a4d768927abdbadf87205a73e
3
+ size 8533895676
6.5bpw/output-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d8219f8e56b714fea2ceaa0fbb71492e0d84d140fb5bf1083f69575de799d61
3
+ size 2300669736
6.5bpw/special_tokens_map.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ }
12
+ }
6.5bpw/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0240ce510f08e6c2041724e9043e33be9d251d1e4a4d94eb68cd47b954b61d2
3
+ size 17078292
6.5bpw/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff