Upload config.json with huggingface_hub

#13
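
For reference, a minimal sketch of how a single file can be pushed with the huggingface_hub client, assuming the target repo id (shown as a placeholder) and a token with write access:

from huggingface_hub import HfApi

api = HfApi()
# Upload the updated config.json to the model repo (repo_id is a placeholder).
api.upload_file(
    path_or_fileobj="config.json",
    path_in_repo="config.json",
    repo_id="<namespace>/<model-name>",
    repo_type="model",
    commit_message="Upload config.json with huggingface_hub",
)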
Files changed (1)
  1. config.json +3 -40
config.json CHANGED
@@ -2,52 +2,15 @@
   "architectures": [
     "ArcticForCausalLM"
   ],
-  "attention_dropout": 0,
   "auto_map": {
     "AutoConfig": "configuration_arctic.ArcticConfig",
     "AutoModel": "modeling_arctic.ArcticModel",
     "AutoModelForCausalLM": "modeling_arctic.ArcticForCausalLM",
     "AutoModelForSequenceClassification": "modeling_arctic.ArcticForSequenceClassification"
   },
+  "attention_dropout": 0,
   "bos_token_id": 31998,
   "enable_expert_tensor_parallelism": false,
-  "enc_index": [
-    0,
-    1,
-    2,
-    3,
-    4,
-    5,
-    6,
-    7,
-    8,
-    9,
-    10,
-    11,
-    12,
-    13,
-    14,
-    15,
-    16,
-    17,
-    18,
-    19,
-    20,
-    21,
-    22,
-    23,
-    24,
-    25,
-    26,
-    27,
-    28,
-    29,
-    30,
-    31,
-    32,
-    33,
-    34
-  ],
   "eos_token_id": 31999,
   "hidden_act": "silu",
   "hidden_size": 7168,
@@ -64,7 +27,7 @@
   "num_attention_heads": 56,
   "num_experts_per_tok": 2,
   "num_hidden_layers": 35,
-  "num_key_value_heads": 56,
+  "num_key_value_heads": 8,
   "num_local_experts": 128,
   "parallel_attn_mlp_res": true,
   "quantization": null,
@@ -74,7 +37,7 @@
   "sliding_window": null,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.39.0.dev0",
+  "transformers_version": "4.40.0.dev0",
   "use_cache": true,
   "use_residual": true,
   "vocab_size": 32000