File size: 3,405 Bytes

d1e5b17

# Model Card for mistral-trimegistus-7b-gguf

This model repo holds GGUF-quantized versions of [teknium/Mistral-Trismegistus-7B](https://huggingface.co/teknium/Mistral-Trismegistus-7B).

## Model Details

Transcendence is All You Need! Mistral Trismegistus is a model made for people interested in the esoteric, occult, and spiritual. 

### Model Description


- The First Powerful Occult Expert Model: trained on ~10,000 high-quality, deep, rich instructions on the occult, esoteric, and spiritual.

- Fast: Trained on Mistral, a state-of-the-art 7B-parameter model, so you can run this model FAST even on a CPU.
- Not a positivity-nazi: This model was trained on all forms of esoteric tasks and knowledge, and is not burdened by the flowery nature of many other models, which choose positivity over creativity.


### Model Sources

All credit goes to the original model: [teknium/Mistral-Trismegistus-7B](https://huggingface.co/teknium/Mistral-Trismegistus-7B).

## Usage

Prompt format without a system message:

```
USER: <prompt>
ASSISTANT:
```

OR, with a system message:

```
<system message>
USER: <prompt>
ASSISTANT:
```


## Training Details

#### Training Hyperparameters

```json
{
  "_name_or_path": {
    "desc": null,
    "value": "mistralai/Mistral-7B-v0.1"
  },
  "architectures": {
    "desc": null,
    "value": [
      "MistralForCausalLM"
    ]
  },
  "bad_words_ids": {
    "desc": null,
    "value": null
  },
  "bench_dataset": {
    "desc": null,
    "value": "pharaouk/dharma-1/dharma_1_mini.json"
  },
  "learning_rate": {
    "desc": null,
    "value": 0.0004
  },
  "max_grad_norm": {
    "desc": null,
    "value": 1
  },
  "fp16_opt_level": {
    "desc": null,
    "value": "O1"
  },
  "length_penalty": {
    "desc": null,
    "value": 1
  },
  "max_seq_length": {
    "desc": null,
    "value": 4096
  },
  "sliding_window": {
    "desc": null,
    "value": 4096
  },
  "num_beam_groups": {
    "desc": null,
    "value": 1
  },
  "initializer_range": {
    "desc": null,
    "value": 0.02
  },
  "intermediate_size": {
    "desc": null,
    "value": 14336
  },
  "lr_scheduler_type": {
    "desc": null,
    "value": "cosine"
  },
  "num_hidden_layers": {
    "desc": null,
    "value": 32
  },
  "repetition_penalty": {
    "desc": null,
    "value": 1
  },
  "evaluation_strategy": {
    "desc": null,
    "value": "steps"
  },
  "num_attention_heads": {
    "desc": null,
    "value": 32
  },
  "num_key_value_heads": {
    "desc": null,
    "value": 8
  },
  "quantization_config": {
    "desc": null,
    "value": {
      "load_in_4bit": true,
      "load_in_8bit": false,
      "quant_method": "QuantizationMethod.BITS_AND_BYTES",
      "llm_int8_threshold": 6,
      "bnb_4bit_quant_type": "nf4",
      "llm_int8_skip_modules": null,
      "bnb_4bit_compute_dtype": "bfloat16",
      "llm_int8_has_fp16_weight": false,
      "bnb_4bit_use_double_quant": true,
      "llm_int8_enable_fp32_cpu_offload": false
    }
  }
}
```


#### Speeds, Sizes, Times 

```json
{
  "_step": 9589,
  "_wandb.runtime": 12960,
  "_runtime": 12960.192620515823,
  "eval/loss": 1.4308836460113523,
  "train/train_steps_per_second": 0.739,
  "train/train_samples_per_second": 2.956,
  "train/loss": 0.3396,
  "train/epoch": 4,
  "train/total_flos": 1757020072120942600,
  "train/train_loss": 0.8929485179171377,
  "train/learning_rate": 0,
  "eval/steps_per_second": 2.196,
  "_timestamp": 1696542775.2713604,
  "eval/runtime": 11.3829,
  "train/global_step": 9584,
  "train/train_runtime": 12962.7813,
  "eval/samples_per_second": 8.522
}
```