---
library_name: transformers
tags:
- mergekit
- merge
- llama-3.1
- roleplay
- function calling
base_model:
- T145/ZEUS-8B-V2
license: llama3.1
model-index:
- name: ZEUS-8B-V2-abliterated
  results:
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: IFEval (0-Shot)
      type: wis-k/instruction-following-eval
      split: train
      args:
        num_few_shot: 0
    metrics:
    - type: inst_level_strict_acc and prompt_level_strict_acc
      value: 78.95
      name: averaged accuracy
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: BBH (3-Shot)
      type: SaylorTwift/bbh
      split: test
      args:
        num_few_shot: 3
    metrics:
    - type: acc_norm
      value: 30.98
      name: normalized accuracy
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: MATH Lvl 5 (4-Shot)
      type: lighteval/MATH-Hard
      split: test
      args:
        num_few_shot: 4
    metrics:
    - type: exact_match
      value: 20.62
      name: exact match
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: GPQA (0-shot)
      type: Idavidrein/gpqa
      split: train
      args:
        num_few_shot: 0
    metrics:
    - type: acc_norm
      value: 8.39
      name: acc_norm
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: MuSR (0-shot)
      type: TAUR-Lab/MuSR
      args:
        num_few_shot: 0
    metrics:
    - type: acc_norm
      value: 7.92
      name: acc_norm
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: MMLU-PRO (5-shot)
      type: TIGER-Lab/MMLU-Pro
      config: main
      split: test
      args:
        num_few_shot: 5
    metrics:
    - type: acc
      value: 31.39
      name: accuracy
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
      name: Open LLM Leaderboard
---
|
|
|
# ZEUS 8B 🌩️ V2 - ABLITERATED

ZEUS 8B V2 (`T145/ZEUS-8B-V2`), abliterated using the following script:

```python
|
import gc |
|
import random |
|
|
|
import torch |
|
from tqdm import tqdm |
|
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig |
|
|
|
# Hugging Face repo id of the model to abliterate.
MODEL_ID = "T145/ZEUS-8B-V2"

# More samples can help find the direction better.
NUM_PROMPT_SAMPLES = 32

# Used to skip the first and last layers for the modifications.
SKIP_BEGIN_LAYERS = 1
SKIP_END_LAYERS = 1

# The layer we will use for the refusal_dir calculation will be
# floor(LAYER_FRACTION_TO_USE * model.layers).
LAYER_FRACTION_TO_USE = 0.6

# Use a negative scale_factor to "induce" and a positive scale_factor of < 1
# to "ablate" less.
SCALE_FACTOR = 1.0

# Nothing in this script trains, so autograd is switched off globally.
# A bare `torch.inference_mode()` call only constructs a context manager and
# changes nothing unless it is entered (`with ...:`) or used as a decorator,
# so it is not called here; `set_grad_enabled(False)` is what takes effect.
torch.set_default_device("cpu")
torch.set_grad_enabled(False)
|
|
|
# Load the model on the GPU in quantized type if we can.
# NOTE(review): trust_remote_code=True allows code shipped in the model repo
# to run locally; presumably unnecessary for a stock Llama architecture -
# confirm before reusing this script on other repos.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    ),
    low_cpu_mem_usage=True,
    device_map="auto",
)
model.requires_grad_(False)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Layer whose hidden state is probed for the refusal direction; int()
# truncates, so this is floor(num_layers * LAYER_FRACTION_TO_USE).
layer_idx = int(len(model.model.layers) * LAYER_FRACTION_TO_USE)

print(f"Layer index for refusal direction: {layer_idx}")
|
|
|
# Each file holds one instruction per line. readlines() keeps the trailing
# newline of every line, so it becomes part of the prompt content below; this
# is left as-is so the script keeps matching the procedure that was run.
with open("harmful.txt", "r", encoding="utf-8") as f:
    harmful = f.readlines()

with open("harmless.txt", "r", encoding="utf-8") as f:
    harmless = f.readlines()

# Draw at most NUM_PROMPT_SAMPLES instructions from each set, without
# replacement (unseeded, so the selection differs between runs).
harmful_instructions = random.sample(harmful, min(NUM_PROMPT_SAMPLES, len(harmful)))
harmless_instructions = random.sample(harmless, min(NUM_PROMPT_SAMPLES, len(harmless)))


def _as_chat_prompt(insn):
    """Render one instruction as a single user turn followed by the generation prompt.

    With ``tokenize=False`` the chat template returns the rendered text, so
    no ``return_tensors`` argument is passed (it would be ignored).
    """
    return tokenizer.apply_chat_template(
        conversation=[{"role": "user", "content": insn}],
        add_generation_prompt=True,
        tokenize=False,
    )


# Despite the "_toks" suffix these lists hold rendered prompt strings;
# tokenization happens later, inside generate().
harmful_toks = [_as_chat_prompt(insn) for insn in harmful_instructions]
harmless_toks = [_as_chat_prompt(insn) for insn in harmless_instructions]

bar_generate = tqdm(
    total=len(harmful_instructions) + len(harmless_instructions),
    desc="Generating samples",
)
|
|
|
# Only return the final hidden state of the layer we care about, and use 'cpu' to save VRAM.
def generate(toks):
    """Run one prompt through the model and return the probed hidden state.

    Relies on the module-level ``tokenizer``, ``model``, ``layer_idx`` and
    ``bar_generate``; advances ``bar_generate`` by one on every call.

    Args:
        toks: Prompt text to tokenize. The callers in this script pass the
            string rendered by ``apply_chat_template(..., tokenize=False)``.

    Returns:
        ``output.hidden_states[0][layer_idx][:, -1, :]`` moved to the CPU,
        i.e. the slice at the last index of axis 1 for the chosen layer.
    """
    # NOTE(review): the rendered chat template presumably already starts with
    # a BOS token and this call may prepend another one - confirm. Left
    # unchanged so the script matches the procedure that was actually run.
    inputs = tokenizer(toks, return_tensors="pt", padding=True).to(model.device)

    # A single new token is enough: only the hidden states are of interest.
    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        use_cache=False,
        max_new_tokens=1,
        return_dict_in_generate=True,
        output_hidden_states=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    bar_generate.update(n=1)

    # Final hidden state = -1.
    return output.hidden_states[0][layer_idx][:, -1, :].to("cpu")
|
|
|
# One probed hidden state per sampled prompt.
harmful_hidden = [generate(toks) for toks in harmful_toks]
harmless_hidden = [generate(toks) for toks in harmless_toks]

bar_generate.close()

# Average each set over its prompts (dim 0 of the stacked tensor).
harmful_mean = torch.stack(harmful_hidden).mean(dim=0)
harmless_mean = torch.stack(harmless_hidden).mean(dim=0)

# Refusal direction = difference of the two means, scaled to unit norm.
# squeeze() drops the size-1 dimensions so later code receives a plain vector.
refusal_dir = harmful_mean - harmless_mean
refusal_dir = refusal_dir.squeeze() / refusal_dir.norm()

# Keep the direction on disk, e.g. "T145_ZEUS-8B-V2_refusal_dir.pt".
torch.save(refusal_dir, f"{MODEL_ID.replace('/', '_')}_refusal_dir.pt")

# Free memory
del model
gc.collect()
torch.cuda.empty_cache()
|
|
|
# Reload the model in CPU memory with bfloat16 data type
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="cpu",
)
model.requires_grad_(False)

# Make sure it's on the 'cpu' device. Tensor.to() returns the tensor itself
# when it already lives on the target device, so no comparison is needed.
refusal_dir = refusal_dir.to(model.device)

# Get the language model component and check it's as expected.
lm_model = model.model
assert hasattr(lm_model, "layers"), "The model does not have the expected structure."

# Check the ranges are valid.
num_layers = len(lm_model.layers)
assert SKIP_BEGIN_LAYERS >= 0, "SKIP_BEGIN_LAYERS must be >= 0."
assert SKIP_END_LAYERS >= 0, "SKIP_END_LAYERS must be >= 0."
assert SKIP_BEGIN_LAYERS + SKIP_END_LAYERS < num_layers, "SKIP_BEGIN_LAYERS + SKIP_END_LAYERS must be < num_layers."

# Two tensors are rewritten per non-skipped layer, hence the "* 2".
bar_layers = tqdm(
    total=(num_layers - (SKIP_BEGIN_LAYERS + SKIP_END_LAYERS)) * 2,
    desc="Modifying tensors",
)
|
|
|
# NOTE: Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
def modify_tensor(
    tensor_data: torch.Tensor,
    refusal_dir: torch.Tensor,
    scale_factor: float = 1.0,
) -> torch.nn.Parameter:
    """Return ``W - scale_factor * (r r^T) W`` as a new bfloat16 Parameter.

    Advances the module-level ``bar_layers`` progress bar by one per call.

    Args:
        tensor_data: Weight tensor ``W``. It is left-multiplied by the
            ``(d, d)`` outer product, so its first dimension must equal the
            length of ``refusal_dir``.
        refusal_dir: 1-D direction vector ``r`` (``torch.outer`` needs 1-D
            input). The script passes a unit-norm vector; this function does
            not normalise it.
        scale_factor: Fraction of the projection to subtract; must be <= 1.0.

    Returns:
        A new ``torch.nn.Parameter`` holding the modified weights in bfloat16.

    Raises:
        AssertionError: If ``scale_factor`` is greater than 1.0.
    """
    assert scale_factor <= 1.0, "Using a scale_factor of > 1 doesn't make sense..."

    # All arithmetic is done in bfloat16, so the result is already in the
    # dtype that gets stored and needs no final cast.
    weight = tensor_data.to(torch.bfloat16)
    direction = refusal_dir.to(torch.bfloat16)

    # (r r^T) W: the component of W along the refusal direction.
    projection = torch.matmul(torch.outer(direction, direction), weight)

    # Subtract out-of-place: `.to()` returns the input itself when it is
    # already bfloat16, so an in-place `-=` would silently overwrite the
    # caller's tensor in that case (and only in that case).
    ablated = weight - scale_factor * projection

    bar_layers.update(1)
    return torch.nn.Parameter(ablated)
|
|
|
# Modify the 'self_attn.o_proj.weight' and 'mlp.down_proj.weight' in each chosen layer.
# NOTE: These tensor names are specific to "llama" and may need changing.
#       - See here for others: https://github.com/arcee-ai/mergekit/tree/main/mergekit/_data/architectures
# The loop variable is deliberately not called `layer_idx`: that module-level
# name is the probe layer read by generate() and must not be rebound here.
for target_idx in range(SKIP_BEGIN_LAYERS, num_layers - SKIP_END_LAYERS):
    target_layer = lm_model.layers[target_idx]
    target_layer.self_attn.o_proj.weight = modify_tensor(
        target_layer.self_attn.o_proj.weight.data, refusal_dir, SCALE_FACTOR
    )
    target_layer.mlp.down_proj.weight = modify_tensor(
        target_layer.mlp.down_proj.weight.data, refusal_dir, SCALE_FACTOR
    )

bar_layers.close()
|
|
|
print("Saving modified model (with original tokenizer)...")

# MODEL_ID contains a "/", so this is a nested directory relative to the
# current working directory (e.g. "T145/ZEUS-8B-V2-abliterated").
FIXED_ID = f"{MODEL_ID}-abliterated"

# Weights and the unmodified tokenizer go into the same output directory.
model.save_pretrained(FIXED_ID)
tokenizer.save_pretrained(FIXED_ID)
|
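```

When run on this model, the script reports **layer 19** (`int(num_layers * 0.6)`) as the layer at which the refusal direction is measured. That direction is then ablated from the `self_attn.o_proj` and `mlp.down_proj` weights of every layer except the first and the last.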
|
|
|
# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/T145__ZEUS-8B-V2-abliterated-details)!
Summarized results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/contents/viewer/default/train?q=T145%2FZEUS-8B-V2-abliterated&sort[column]=Average%20%E2%AC%86%EF%B8%8F&sort[direction]=desc)!
|
|
|
|      Metric       |Value (%)|
|-------------------|--------:|
|**Average**        |    29.71|
|IFEval (0-Shot)    |    78.95|
|BBH (3-Shot)       |    30.98|
|MATH Lvl 5 (4-Shot)|    20.62|
|GPQA (0-shot)      |     8.39|
|MuSR (0-shot)      |     7.92|
|MMLU-PRO (5-shot)  |    31.39|
|
|
|
|