Quantized using nncf 2.13.0

Files changed (7)

README.md CHANGED Viewed

@@ -7,14 +7,12 @@ tags:
 This is an INT4 quantized version of the `meta-llama/Llama-2-13b-chat-hf` model. The Python packages used in creating this model are as follows:
 ```
-openvino==2024.3.0.dev20240528
-openvino-nightly==2024.3.0.dev20240528
-openvino-tokenizers==2024.3.0.0.dev20240528
-optimum==1.19.2
-optimum-intel==1.17.0.dev0+aefabf0
-nncf==2.11.0.dev0+90a7f0d5
-torch==2.3.0+cu121
-transformers==4.40.2
 ```
 This quantized model is created using the following command:
 ```
@@ -25,5 +23,5 @@ For more details, run the following command from your Python environment: `optim
 INFO:nncf:Statistics of the bitwidth distribution:
 | Num bits (N) | % all parameters (layers) | % ratio-defining parameters (layers) |
 |--------------|---------------------------|--------------------------------------|
-| 8 | 22% (83 / 282) | 20% (81 / 280) |
-| 4 | 78% (199 / 282) | 80% (199 / 280) |

 This is an INT4 quantized version of the `meta-llama/Llama-2-13b-chat-hf` model. The Python packages used in creating this model are as follows:
 ```
+openvino==2024.4.0
+optimum==1.23.3
+optimum-intel==1.20.1
+nncf==2.13.0
+torch==2.5.1
+transformers==4.46.1
 ```
 This quantized model is created using the following command:
 ```
 INFO:nncf:Statistics of the bitwidth distribution:
 | Num bits (N) | % all parameters (layers) | % ratio-defining parameters (layers) |
 |--------------|---------------------------|--------------------------------------|
+| 8 | 3% (2 / 282) | 0% (0 / 280) |
+| 4 | 97% (280 / 282) | 100% (280 / 280) |

config.json CHANGED Viewed

@@ -1,4 +1,5 @@
 {
   "_name_or_path": "meta-llama/Llama-2-13b-chat-hf",
   "architectures": [
     "LlamaForCausalLM"
@@ -7,11 +8,13 @@
   "attention_dropout": 0.0,
   "bos_token_id": 1,
   "eos_token_id": 2,
   "hidden_act": "silu",
   "hidden_size": 5120,
   "initializer_range": 0.02,
   "intermediate_size": 13824,
   "max_position_embeddings": 4096,
   "model_type": "llama",
   "num_attention_heads": 40,
   "num_hidden_layers": 40,
@@ -21,7 +24,8 @@
   "rope_scaling": null,
   "rope_theta": 10000.0,
   "tie_word_embeddings": false,
-  "transformers_version": "4.40.2",
   "use_cache": true,
   "vocab_size": 32000
 }

 {
+  "_attn_implementation_autoset": true,
   "_name_or_path": "meta-llama/Llama-2-13b-chat-hf",
   "architectures": [
     "LlamaForCausalLM"
   "attention_dropout": 0.0,
   "bos_token_id": 1,
   "eos_token_id": 2,
+  "head_dim": 128,
   "hidden_act": "silu",
   "hidden_size": 5120,
   "initializer_range": 0.02,
   "intermediate_size": 13824,
   "max_position_embeddings": 4096,
+  "mlp_bias": false,
   "model_type": "llama",
   "num_attention_heads": 40,
   "num_hidden_layers": 40,
   "rope_scaling": null,
   "rope_theta": 10000.0,
   "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.46.1",
   "use_cache": true,
   "vocab_size": 32000
 }

generation_config.json CHANGED Viewed

@@ -6,5 +6,5 @@
   "pad_token_id": 0,
   "temperature": 0.6,
   "top_p": 0.9,
-  "transformers_version": "4.40.2"
 }

   "pad_token_id": 0,
   "temperature": 0.6,
   "top_p": 0.9,
+  "transformers_version": "4.46.1"
 }

openvino_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:96be0292e81bca0757ad3e9bbfb936ad92cd10a3404e4022b42e029cf23c4883
-size 8157059168

 version https://git-lfs.github.com/spec/v1
+oid sha256:064cdc248bbfdd396d26d976af4f74f2f9ea4dfe01a00df1128df88a0a9dd7c8
+size 6921224160

openvino_model.xml CHANGED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json CHANGED Viewed

@@ -1,6 +1,7 @@
 {
   "add_bos_token": true,
   "add_eos_token": false,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",

 {
   "add_bos_token": true,
   "add_eos_token": false,
+  "add_prefix_space": null,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",