Upload folder using huggingface_hub

Browse files

Uploading initialised weights and configs

Files changed (12) hide show

config.json +26 -0
generation_config.json +6 -0
model-00001-of-00003.safetensors +3 -0
model-00002-of-00003.safetensors +3 -0
model-00003-of-00003.safetensors +3 -0
model.safetensors.index.json +118 -0
run.sh +18 -0
run_initialization.py +199 -0
special_tokens_map.json +23 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +43 -0

config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 12,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.0.dev0",
+  "use_cache": true,
+  "vocab_size": 32000
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.40.0.dev0"
+}

model-00001-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e00f58089ce8591445b2fa83d7d000a3bba1d19ef3d264301feba0e3c6b6b78b
+size 4987196936

model-00002-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bdf4831d59aefe0d24df08b5027d72a4b99ea0f5c2051a193f128cdb0b049c0a
+size 4899116440

model-00003-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0caa70b4e09a2aa2b7ca0092e24c0d80735c4b42888d382b10ea84f5d6350b01
+size 1631667768

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,118 @@

+{
+  "metadata": {
+    "total_size": 11517968384
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00003-of-00003.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.norm.weight": "model-00003-of-00003.safetensors"
+  }
+}

run.sh ADDED Viewed

	@@ -0,0 +1,18 @@

+#!/bin/bash
+#SBATCH --partition=hopper-cpu
+#SBATCH --name=mistral-init
+#SBATCH --mem=1g
+#SBATCH --time=1:00:00
+#SBATCH --cpus-per-task=1
+#SBATCH --mem-per-cpu=1
+#SBATCH -o /fsx/sanchit/logs/init-%j-%x.out
+echo "Starting job"
+srun python3 run_initialization.py \
+  --model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
+  --num_hidden_layers "12" \
+  --output_dir "./" \
+  --push_to_hub
+wait

run_initialization.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import copy
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+import numpy as np
+import torch
+from huggingface_hub import create_repo, get_full_repo_name, upload_folder
+from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
+logger = logging.getLogger(__name__)  # module-level logger, named after this module
+logger.setLevel(logging.INFO)  # let INFO-level progress messages pass this logger's level check
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune.
+    """
+    model_name_or_path: Optional[str] = field(
+        metadata={"help": "The teacher checkpoint for weights initialization"},
+    )
+    output_dir: str = field(
+        metadata={"help": "The output directory where the student checkpoint will be written."},
+    )
+    model_revision: Optional[str] = field(
+        default="main",
+        metadata={"help": "The specific teacher model version to use (can be a branch name, tag name or commit id)."},
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where to store the pre-trained models downloaded from huggingface.co"},
+    )
+    subfolder: Optional[str] = field(
+        default="",
+        metadata={
+            "help": "In case the relevant files are located inside a subfolder of the teacher model repo on huggingface.co, you can"
+            "specify the folder name here."
+        },
+    )
+    torch_dtype: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Override the default `torch.dtype` and load the teacher model under this dtype. If `auto` is passed, the "
+                "dtype will be automatically derived from the model's weights."
+            ),
+            "choices": ["auto", "bfloat16", "float16", "float32"],
+        },
+    )
+    trust_remote_code: Optional[bool] = field(
+        default=False, metadata={"help": "Trust remote code when loading a model."}
+    )
+    token: Optional[bool] = field(
+        default=True,
+        metadata={
+            "help": "Will use the token generated when running `transformers-cli login` necessary to use this script with private models)."
+        },
+    )
+    num_hidden_layers: Optional[int] = field(
+        default=6,
+        metadata={"help": "The number of hidden layers in the Transformer decoder."},
+    )
+    push_to_hub: Optional[bool] = field(
+        default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."}
+    )
+    hub_model_id: Optional[str] = field(
+        default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."}
+    )
+    low_cpu_mem_usage: Optional[bool] = field(
+        default=True,
+        metadata={
+            "help": "Create the teacher model as an empty shell, and only materialize its parameters when the pretrained weights are loaded. "
+            "Significantly benefits loading time and RAM consumption."
+        },
+    )
+    initialization_strategy: Optional[str] = field(
+        default="maximally_spaced",
+        metadata={
+            "help": "The weight initialization strategy for the decoder weights. Either `first_n`, or `maximally_spaced`."
+        },
+    )
+def main():
+    # 1. Parse input arguments
+    parser = HfArgumentParser(ModelArguments)
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))[0]
+    else:
+        model_args = parser.parse_args_into_dataclasses()[0]
+    logger.info(f"Model parameters {model_args}")
+    logger.info("*** Load pretrained teacher model ***")
+    torch_dtype = (
+        model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
+    )
+    # quantization_config = get_quantization_config(model_args)
+    teacher_model = AutoModelForCausalLM.from_pretrained(
+        model_args.model_name_or_path,
+        torch_dtype=torch_dtype,
+        low_cpu_mem_usage=model_args.low_cpu_mem_usage,
+        revision=model_args.model_revision,
+        cache_dir=model_args.cache_dir,
+        subfolder=model_args.subfolder,
+        trust_remote_code=model_args.trust_remote_code,
+        token=model_args.token,
+        # device_map=get_kbit_device_map() if quantization_config is not None else None,
+        # quantization_config=quantization_config,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+    generation_config = teacher_model.generation_config
+    teacher_config = teacher_model.config
+    logger.info("*** Teacher model loaded! ***")
+    student_config = copy.deepcopy(teacher_config)
+    student_config.num_hidden_layers = model_args.num_hidden_layers
+    teacher_hidden_layers = teacher_config.num_hidden_layers
+    if model_args.initialization_strategy == "maximally_spaced":
+        decoder_mapping = np.linspace(0, teacher_hidden_layers - 1, student_config.num_hidden_layers, dtype=int)
+    elif model_args.initialization_strategy == "first_n":
+        decoder_mapping = np.arange(0, student_config.num_hidden_layers)
+    else:
+        raise ValueError(
+            f"Got invalid initialization_strategy strategy '{model_args.initialization_strategy}', should be one of "
+            "'maximally_spaced` or `first_n`."
+        )
+    # always use the last teacher layer as the last student layer
+    decoder_mapping[-1] = teacher_hidden_layers - 1
+    decoder_map = {}
+    for student_layer, teacher_layer in enumerate(decoder_mapping):
+        decoder_map[teacher_layer] = student_layer
+    # init the student params from the teacher model
+    logger.info("*** Load and initialise student model ***")
+    student_model = AutoModelForCausalLM.from_config(student_config)
+    missing_keys, unexpected_keys = student_model.load_state_dict(teacher_model.state_dict(), strict=False)
+    student_model.to(dtype=torch_dtype)
+    if len(missing_keys) > 0:
+        raise RuntimeError(
+            f"Error(s) in loading state_dict for {student_model.__class__.__name__}. \n"
+            f"Missing key(s) in state_dict: {missing_keys}"
+        )
+    if student_config.num_hidden_layers == teacher_hidden_layers:
+        decoder_keys = [key for key in unexpected_keys if "model.layers" in key]
+        if len(decoder_keys) > 0:
+            raise RuntimeError(
+                f"Error(s) in loading state_dict for {student_model.__class__.__name__}. \n"
+                f"Unexpected key(s) in state_dict: {decoder_keys}"
+            )
+    for layer in range(teacher_hidden_layers):
+        if layer in decoder_map:
+            # re-introduce pre-defined layers from the teacher
+            student_model.model.layers[decoder_map[layer]].load_state_dict(
+                teacher_model.model.layers[layer].state_dict()
+            )
+    logger.info("*** Student model loaded! ***")
+    # remove the teacher params and model
+    del teacher_model
+    # save the converted weights and model
+    if model_args.output_dir is not None:
+        student_model.save_pretrained(model_args.output_dir)
+        # we also need to correctly save the processor and generation config
+        tokenizer.save_pretrained(model_args.output_dir)
+        generation_config.save_pretrained(model_args.output_dir)
+    if model_args.push_to_hub:
+        if model_args.hub_model_id is None:
+            repo_name = get_full_repo_name(
+                Path(model_args.output_dir).absolute().name,
+                token=model_args.token,
+            )
+        else:
+            repo_name = model_args.hub_model_id
+        create_repo(repo_name, exist_ok=True, token=model_args.token)
+        upload_folder(
+            repo_id=repo_name,
+            folder_path=model_args.output_dir,
+            commit_description="Uploading initialised weights and configs",
+        )
+if __name__ == "__main__":
+    main()

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}