Model config.json has Mistral params instead of Mixtral, breaking ExLlama quants and maybe affecting others too
I got reports that ExLlamav2 wasn't working with this GPTQ. It turns out that's because it was trying to load it as a Mistral model, due to `architectures` being set to Mistral instead of Mixtral.
Also, the `rope_theta` should be 1000000.0 for Mixtral - this can affect inference quality.
I don't think any of this would stop k-quants from working, though, so that issue might be unrelated. I'll try making some anyway.
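If you've already downloaded this quant, re-downloading shouldn't be necessary; patching your local config.json with the same values should be enough. A minimal sketch, assuming the files live in `./model-dir` (a hypothetical path; adjust it to wherever you downloaded the model):

```python
import json
from pathlib import Path

# Assumption: local directory containing the downloaded model files
config_path = Path("./model-dir/config.json")

config = json.loads(config_path.read_text())

# Apply the same fixes as this commit
config["architectures"] = ["MixtralForCausalLM"]
config["rope_theta"] = 1000000.0
config["router_aux_loss_coef"] = 0.02
config["use_cache"] = True

config_path.write_text(json.dumps(config, indent=2) + "\n")
print("patched", config_path)
```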
config.json (+4, -4):

```diff
@@ -1,7 +1,7 @@
 {
   "_name_or_path": "jondurbin/bagel-7b-v0.1",
   "architectures": [
-    "MistralForCausalLM"
+    "MixtralForCausalLM"
   ],
   "attention_dropout": 0.0,
   "bos_token_id": 1,
@@ -19,12 +19,12 @@
   "num_local_experts": 8,
   "output_router_logits": false,
   "rms_norm_eps": 1e-05,
-  "rope_theta":
-  "router_aux_loss_coef": 0.
+  "rope_theta": 1000000.0,
+  "router_aux_loss_coef": 0.02,
   "sliding_window": 4096,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.37.0.dev0",
-  "use_cache":
+  "use_cache": true,
   "vocab_size": 32000
 }
```
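With the updated config (or the local patch above), loaders that go through the config should now detect a Mixtral model. A quick check with transformers, assuming the files are in the hypothetical `./model-dir`:

```python
from transformers import AutoConfig

# Assumption: local directory containing the patched config.json
cfg = AutoConfig.from_pretrained("./model-dir")

print(cfg.model_type)   # expected: "mixtral"
print(cfg.rope_theta)   # expected: 1000000.0
```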