Model config.json has Mistral params instead of Mixtral, breaking ExLlama quants and maybe affecting others too
I got reports that ExLlamav2 wasn't working with this GPTQ. It turns out that's because it was trying to load it as a Mistral model, due to `architectures` being set to Mistral instead of Mixtral.
Also, the `rope_theta` should be 1000000.0 for Mixtral - this can affect inference quality.
I don't think any of this would stop k-quants from working, though, so that issue might be unrelated. I'll try making some anyway.
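If you've already downloaded this quant, re-downloading shouldn't be necessary; patching your local config.json with the same values should be enough. A minimal sketch, assuming the files live in `./model-dir` (a hypothetical path; adjust it to wherever you downloaded the model):

```python
import json
from pathlib import Path

# Assumption: local directory containing the downloaded model files
config_path = Path("./model-dir/config.json")

config = json.loads(config_path.read_text())

# Apply the same fixes as this commit
config["architectures"] = ["MixtralForCausalLM"]
config["rope_theta"] = 1000000.0
config["router_aux_loss_coef"] = 0.02
config["use_cache"] = True

config_path.write_text(json.dumps(config, indent=2) + "\n")
print("patched", config_path)
```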
config.json (+4, -4):

```diff
@@ -1,7 +1,7 @@
 {
   "_name_or_path": "jondurbin/bagel-7b-v0.1",
   "architectures": [
-    "MistralForCausalLM"
+    "MixtralForCausalLM"
   ],
   "attention_dropout": 0.0,
   "bos_token_id": 1,
@@ -19,12 +19,12 @@
   "num_local_experts": 8,
   "output_router_logits": false,
   "rms_norm_eps": 1e-05,
-  "rope_theta":
-  "router_aux_loss_coef": 0.
+  "rope_theta": 1000000.0,
+  "router_aux_loss_coef": 0.02,
   "sliding_window": 4096,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.37.0.dev0",
-  "use_cache":
+  "use_cache": true,
   "vocab_size": 32000
 }
```
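With the updated config (or the local patch above), loaders that go through the config should now detect a Mixtral model. A quick check with transformers, assuming the files are in the hypothetical `./model-dir`:

```python
from transformers import AutoConfig

# Assumption: local directory containing the patched config.json
cfg = AutoConfig.from_pretrained("./model-dir")

print(cfg.model_type)   # expected: "mixtral"
print(cfg.rope_theta)   # expected: 1000000.0
```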