ZEUS 8B 🌩️ V2 - ABLITERATED

V2 abliterated using the following script:
import gc
import random

import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "T145/ZEUS-8B-V2"

# More samples can help find the direction better.
NUM_PROMPT_SAMPLES = 32

# Used to skip the first and last layers for the modifications.
SKIP_BEGIN_LAYERS = 1
SKIP_END_LAYERS = 1

# The layer we will use for the refusal_dir calculation will be floor(LAYER_FRACTION_TO_USE * model.layers).
LAYER_FRACTION_TO_USE = 0.6

# Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
SCALE_FACTOR = 1.0

torch.inference_mode()
torch.set_default_device("cpu")
torch.set_grad_enabled(False)

# Load the model on the GPU in quantized type if we can.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
    low_cpu_mem_usage=True,
    device_map='auto'
)
model.requires_grad_(False)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
layer_idx = int(len(model.model.layers) * LAYER_FRACTION_TO_USE)

print("Layer index for refusal direction: " + str(layer_idx))

with open("harmful.txt", "r", encoding="utf-8") as f:
    harmful = f.readlines()

with open("harmless.txt", "r", encoding="utf-8") as f:
    harmless = f.readlines()

harmful_instructions = random.sample(harmful, min(NUM_PROMPT_SAMPLES, len(harmful)))
harmless_instructions = random.sample(harmless, min(NUM_PROMPT_SAMPLES, len(harmless)))

harmful_toks = [
    tokenizer.apply_chat_template(conversation=[{"role": "user", "content": insn}], add_generation_prompt=True, tokenize=False,
                                  return_tensors="pt") for insn in harmful_instructions]
harmless_toks = [
    tokenizer.apply_chat_template(conversation=[{"role": "user", "content": insn}], add_generation_prompt=True, tokenize=False,
                                  return_tensors="pt") for insn in harmless_instructions]

bar_generate = tqdm(total = len(harmful_instructions) + len(harmless_instructions), desc = "Generating samples")

# Only return the final hidden state of the layer we care about, and use 'cpu' to save VRAM.
def generate(toks):
    inputs = tokenizer(toks, return_tensors="pt", padding=True)
    inputs = inputs.to(model.device)
    output = model.generate(
        inputs['input_ids'],
        use_cache=False,
        max_new_tokens=1,
        return_dict_in_generate=True,
        output_hidden_states=True,
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.eos_token_id
    )
    bar_generate.update(n=1)
    return output.hidden_states[0][layer_idx][:, -1, :].to('cpu') # Final hidden state = -1.

harmful_hidden = [generate(toks) for toks in harmful_toks]
harmless_hidden = [generate(toks) for toks in harmless_toks]

bar_generate.close()

harmful_mean = torch.stack(harmful_hidden).mean(dim=0)
harmless_mean = torch.stack(harmless_hidden).mean(dim=0)

refusal_dir = harmful_mean - harmless_mean
refusal_dir = refusal_dir.squeeze() / refusal_dir.norm()

torch.save(refusal_dir, MODEL_ID.replace("/", "_") + "_refusal_dir.pt")

# Free memory
del model
gc.collect()
torch.cuda.empty_cache()

# Reload the model in CPU memory with bfloat16 data type
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map='cpu'
)
model.requires_grad_(False)

# Make sure it's on the 'cpu' device.
if refusal_dir.device != model.device:
    refusal_dir = refusal_dir.to(model.device)

# Get the language model component and check it's as expected.
lm_model = model.model
assert hasattr(lm_model, 'layers'), "The model does not have the expected structure."

# Check the ranges are valid.
num_layers = len(lm_model.layers)
assert SKIP_BEGIN_LAYERS >= 0, "SKIP_BEGIN_LAYERS must be >= 0."
assert SKIP_END_LAYERS >= 0, "SKIP_END_LAYERS must be >= 0."
assert SKIP_BEGIN_LAYERS + SKIP_END_LAYERS < num_layers, "SKIP_BEGIN_LAYERS + SKIP_END_LAYERS must be < num_layers."

bar_layers = tqdm(total= (num_layers - (SKIP_BEGIN_LAYERS + SKIP_END_LAYERS)) * 2, desc = "Modifying tensors")

# NOTE: Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
    assert scale_factor <= 1.0, "Using a scale_factor of > 1 doesn't make sense..."
    tensor_float = tensor_data.to(torch.bfloat16)
    refusal_dir_float = refusal_dir.to(torch.bfloat16)
    tensor_float -= scale_factor * torch.matmul(torch.outer(refusal_dir_float, refusal_dir_float), tensor_float)
    tensor_modified = tensor_float.to(torch.bfloat16)
    bar_layers.update(1)
    return torch.nn.Parameter(tensor_modified)

# Modify the 'self_attn.o_proj.weight' and 'mlp.down_proj.weight' in each chosen layer.
# NOTE: These tensors names are speific to "llama" and may need changing.
#       - See here for others: https://github.com/arcee-ai/mergekit/tree/main/mergekit/_data/architectures
for layer_idx in range(SKIP_BEGIN_LAYERS, num_layers - SKIP_END_LAYERS):
    lm_model.layers[layer_idx].self_attn.o_proj.weight = modify_tensor(
        lm_model.layers[layer_idx].self_attn.o_proj.weight.data, refusal_dir, SCALE_FACTOR
    )
    lm_model.layers[layer_idx].mlp.down_proj.weight = modify_tensor(
        lm_model.layers[layer_idx].mlp.down_proj.weight.data, refusal_dir, SCALE_FACTOR
    )

bar_layers.close()

print("Saving modified model (with original tokenizer)...")

FIXED_ID = f"{MODEL_ID}-abliterated"
model.save_pretrained(FIXED_ID)
tokenizer.save_pretrained(FIXED_ID)
According to the script, layer 19 is the primary target for abliteration.
Open LLM Leaderboard Evaluation Results

Detailed results can be found here! Summarized results can be found here!
Metric	Value (%)
Average	29.71
IFEval (0-Shot)	78.95
BBH (3-Shot)	30.98
MATH Lvl 5 (4-Shot)	20.62
GPQA (0-shot)	8.39
MuSR (0-shot)	7.92
MMLU-PRO (5-shot)	31.39
T145
/

ZEUS-8B-V2-abliterated

ZEUS 8B 🌩️ V2 - ABLITERATED

Open LLM Leaderboard Evaluation Results

Model tree for T145/ZEUS-8B-V2-abliterated

Evaluation results