Update README.md

0f5b018 verified 7 days ago

9.53 kB

	---
	# Hugging Face model card metadata.
	library_name: transformers
	tags:
	- mergekit
	- merge
	- llama-3.1
	- roleplay
	- function calling
	base_model:
	- T145/ZEUS-8B-V2
	license: llama3.1
	# Benchmark results sourced from the Open LLM Leaderboard, one `results` entry
	# per benchmark. Each entry nests its own task, dataset, metrics and source.
	model-index:
	- name: ZEUS-8B-V2-abliterated
	  results:
	  - task:
	      type: text-generation
	      name: Text Generation
	    dataset:
	      name: IFEval (0-Shot)
	      type: wis-k/instruction-following-eval
	      split: train
	      args:
	        num_few_shot: 0
	    metrics:
	    - type: inst_level_strict_acc and prompt_level_strict_acc
	      value: 78.95
	      name: averaged accuracy
	    source:
	      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
	      name: Open LLM Leaderboard
	  - task:
	      type: text-generation
	      name: Text Generation
	    dataset:
	      name: BBH (3-Shot)
	      type: SaylorTwift/bbh
	      split: test
	      args:
	        num_few_shot: 3
	    metrics:
	    - type: acc_norm
	      value: 30.98
	      name: normalized accuracy
	    source:
	      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
	      name: Open LLM Leaderboard
	  - task:
	      type: text-generation
	      name: Text Generation
	    dataset:
	      name: MATH Lvl 5 (4-Shot)
	      type: lighteval/MATH-Hard
	      split: test
	      args:
	        num_few_shot: 4
	    metrics:
	    - type: exact_match
	      value: 20.62
	      name: exact match
	    source:
	      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
	      name: Open LLM Leaderboard
	  - task:
	      type: text-generation
	      name: Text Generation
	    dataset:
	      name: GPQA (0-shot)
	      type: Idavidrein/gpqa
	      split: train
	      args:
	        num_few_shot: 0
	    metrics:
	    - type: acc_norm
	      value: 8.39
	      name: acc_norm
	    source:
	      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
	      name: Open LLM Leaderboard
	  - task:
	      type: text-generation
	      name: Text Generation
	    dataset:
	      name: MuSR (0-shot)
	      type: TAUR-Lab/MuSR
	      args:
	        num_few_shot: 0
	    metrics:
	    - type: acc_norm
	      value: 7.92
	      name: acc_norm
	    source:
	      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
	      name: Open LLM Leaderboard
	  - task:
	      type: text-generation
	      name: Text Generation
	    dataset:
	      name: MMLU-PRO (5-shot)
	      type: TIGER-Lab/MMLU-Pro
	      config: main
	      split: test
	      args:
	        num_few_shot: 5
	    metrics:
	    - type: acc
	      value: 31.39
	      name: accuracy
	    source:
	      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V2-abliterated
	      name: Open LLM Leaderboard
	---

	# ZEUS 8B 🌩️ V2 - ABLITERATED

	ZEUS-8B-V2 was abliterated using the following script:

	```python
	import gc
	import random

	import torch
	from tqdm import tqdm
	from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

	# Hub id of the model to abliterate; also used to name the saved outputs.
	MODEL_ID = "T145/ZEUS-8B-V2"

	# Upper bound on how many prompts are drawn from each prompt file.
	# More samples can help find the direction better.
	NUM_PROMPT_SAMPLES = 32

	# Used to skip the first and last layers for the modifications.
	SKIP_BEGIN_LAYERS = 1
	SKIP_END_LAYERS = 1

	# The layer we will use for the refusal_dir calculation will be floor(LAYER_FRACTION_TO_USE * model.layers).
	LAYER_FRACTION_TO_USE = 0.6

	# Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
	SCALE_FACTOR = 1.0

	# Nothing in this script trains, so autograd is switched off for the whole process.
	# A bare `torch.inference_mode()` statement only constructs a context manager and
	# never enters it, so it had no effect and is not used here;
	# `torch.set_grad_enabled(False)` is the call that actually disables gradients.
	torch.set_default_device("cpu")
	torch.set_grad_enabled(False)

	# Quantization settings for the measurement pass: 4-bit loading with a
	# float16 compute dtype.
	measure_quant_config = BitsAndBytesConfig(
	    load_in_4bit=True,
	    bnb_4bit_compute_dtype=torch.float16,
	)

	# Load the model on the GPU in quantized type if we can.
	model = AutoModelForCausalLM.from_pretrained(
	    MODEL_ID,
	    trust_remote_code=True,
	    torch_dtype=torch.float16,
	    quantization_config=measure_quant_config,
	    low_cpu_mem_usage=True,
	    device_map="auto",
	)
	model.requires_grad_(False)

	tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

	# Index used to pick the hidden state the refusal direction is measured from:
	# a fixed fraction of the number of decoder layers, truncated to an int.
	layer_idx = int(LAYER_FRACTION_TO_USE * len(model.model.layers))

	print(f"Layer index for refusal direction: {layer_idx}")

	def _read_prompt_lines(path):
	    """Return every line of the UTF-8 text file at `path`, line endings included."""
	    with open(path, "r", encoding="utf-8") as handle:
	        return handle.readlines()


	def _sample_prompts(lines):
	    """Draw up to NUM_PROMPT_SAMPLES entries from `lines` without replacement."""
	    return random.sample(lines, min(NUM_PROMPT_SAMPLES, len(lines)))


	def _render_chat_prompt(instruction):
	    """Wrap `instruction` as a single user turn and render it with the chat template.

	    `tokenize=False` makes this return text, not token ids.
	    NOTE(review): `return_tensors` looks unused when `tokenize=False` - confirm.
	    """
	    return tokenizer.apply_chat_template(
	        conversation=[{"role": "user", "content": instruction}],
	        add_generation_prompt=True,
	        tokenize=False,
	        return_tensors="pt",
	    )


	# One prompt per line in each file.
	harmful = _read_prompt_lines("harmful.txt")
	harmless = _read_prompt_lines("harmless.txt")

	# Sampling order (harmful first, then harmless) is kept so the RNG is consumed
	# in the same sequence.
	harmful_instructions = _sample_prompts(harmful)
	harmless_instructions = _sample_prompts(harmless)

	# Despite the `_toks` suffix these hold rendered prompt text; tokenization
	# happens later, inside generate().
	harmful_toks = [_render_chat_prompt(insn) for insn in harmful_instructions]
	harmless_toks = [_render_chat_prompt(insn) for insn in harmless_instructions]

	# One progress tick per prompt across both groups.
	bar_generate = tqdm(
	    total=len(harmful_instructions) + len(harmless_instructions),
	    desc="Generating samples",
	)

	# Only return the final hidden state of the layer we care about, and use 'cpu' to save VRAM.
	def generate(toks):
	    """Run one rendered chat prompt through the model and return a hidden state.

	    Args:
	        toks: prompt text already rendered by the tokenizer's chat template
	            (the caller passes one string per call).

	    Returns:
	        The hidden state at the last input position, taken from entry
	        `layer_idx` of the first generation step's hidden states, moved to CPU.

	    Side effects:
	        Advances the `bar_generate` progress bar by one.
	    """
	    # The chat template has already written its special tokens into the text, so
	    # the tokenizer must not add them again (otherwise e.g. BOS is duplicated).
	    # NOTE(review): assumes the model's chat template emits its own BOS, as the
	    # stock Llama 3.1 template does - confirm for this checkpoint.
	    inputs = tokenizer(
	        toks,
	        return_tensors="pt",
	        padding=True,
	        add_special_tokens=False,
	    )
	    inputs = inputs.to(model.device)
	    # A single new token is enough: only the hidden states for the prompt are used.
	    output = model.generate(
	        inputs["input_ids"],
	        use_cache=False,
	        max_new_tokens=1,
	        return_dict_in_generate=True,
	        output_hidden_states=True,
	        attention_mask=inputs["attention_mask"],
	        pad_token_id=tokenizer.eos_token_id,
	    )
	    bar_generate.update(n=1)
	    # hidden_states[0] is the first generation step; [:, -1, :] keeps the last position.
	    return output.hidden_states[0][layer_idx][:, -1, :].to("cpu")

	# Collect one hidden-state tensor per prompt: harmful prompts first, then harmless.
	harmful_hidden = list(map(generate, harmful_toks))
	harmless_hidden = list(map(generate, harmless_toks))

	bar_generate.close()

	# Average each group over its prompts.
	harmful_mean = torch.stack(harmful_hidden).mean(dim=0)
	harmless_mean = torch.stack(harmless_hidden).mean(dim=0)

	# Refusal direction: difference of the two group means, with singleton
	# dimensions dropped and scaled to unit norm.
	mean_difference = harmful_mean - harmless_mean
	refusal_dir = mean_difference.squeeze() / mean_difference.norm()

	# Keep a copy of the direction on disk next to the script.
	torch.save(refusal_dir, f"{MODEL_ID.replace('/', '_')}_refusal_dir.pt")

	# Free memory: drop the quantized model before the bfloat16 copy is loaded.
	del model
	gc.collect()
	torch.cuda.empty_cache()

	# Reload the model in CPU memory with bfloat16 data type
	model = AutoModelForCausalLM.from_pretrained(
	    MODEL_ID,
	    trust_remote_code=True,
	    torch_dtype=torch.bfloat16,
	    low_cpu_mem_usage=True,
	    device_map="cpu",
	)
	model.requires_grad_(False)

	# Make sure the direction lives on the same device as the model ('cpu').
	# `.to()` hands back the same tensor when it is already there.
	refusal_dir = refusal_dir.to(model.device)

	# Get the language model component and check it's as expected.
	lm_model = model.model
	assert hasattr(lm_model, 'layers'), "The model does not have the expected structure."

	# Check the ranges are valid.
	num_layers = len(lm_model.layers)
	assert SKIP_BEGIN_LAYERS >= 0, "SKIP_BEGIN_LAYERS must be >= 0."
	assert SKIP_END_LAYERS >= 0, "SKIP_END_LAYERS must be >= 0."
	assert SKIP_BEGIN_LAYERS + SKIP_END_LAYERS < num_layers, "SKIP_BEGIN_LAYERS + SKIP_END_LAYERS must be < num_layers."

	# Two tensors are rewritten in every non-skipped layer, hence the factor of 2.
	bar_layers = tqdm(
	    total=2 * (num_layers - SKIP_BEGIN_LAYERS - SKIP_END_LAYERS),
	    desc="Modifying tensors",
	)

	# NOTE: Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
	def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
	    """Remove (or scale) the refusal-direction component of a weight matrix.

	    Computes `W - scale_factor * (r r^T) W`, where `r` is `refusal_dir` and
	    `W` is `tensor_data`.

	    Args:
	        tensor_data: 2-D weight tensor whose first dimension matches the
	            length of `refusal_dir`. It is not modified.
	        refusal_dir: 1-D direction vector.
	        scale_factor: fraction of the component to remove; must be <= 1.0.

	    Returns:
	        A new `torch.nn.Parameter` holding the modified weights in bfloat16.

	    Raises:
	        AssertionError: if `scale_factor` is greater than 1.0.

	    Side effects:
	        Advances the `bar_layers` progress bar by one.
	    """
	    assert scale_factor <= 1.0, "Using a scale_factor of > 1 doesn't make sense..."
	    # Do the arithmetic in float32 and round to bfloat16 once at the end.
	    # Casting to bfloat16 up front made both casts no-ops on bfloat16 weights,
	    # ran the projection at bfloat16 precision, and let the in-place
	    # subtraction write through to the caller's tensor.
	    tensor_float = tensor_data.to(torch.float32)
	    refusal_dir_float = refusal_dir.to(torch.float32)
	    projection = torch.matmul(torch.outer(refusal_dir_float, refusal_dir_float), tensor_float)
	    # Out-of-place subtraction so the input is left untouched even when it is
	    # already float32 (in which case `.to()` returns the same tensor).
	    tensor_modified = (tensor_float - scale_factor * projection).to(torch.bfloat16)
	    bar_layers.update(1)
	    return torch.nn.Parameter(tensor_modified)

	# Modify the 'self_attn.o_proj.weight' and 'mlp.down_proj.weight' in each chosen layer.
	# NOTE: These tensor names are specific to "llama" and may need changing.
	# - See here for others: https://github.com/arcee-ai/mergekit/tree/main/mergekit/_data/architectures
	for block_idx in range(SKIP_BEGIN_LAYERS, num_layers - SKIP_END_LAYERS):
	    decoder_layer = lm_model.layers[block_idx]

	    # Attention output projection first, then the MLP down projection.
	    attn_out = decoder_layer.self_attn.o_proj
	    attn_out.weight = modify_tensor(attn_out.weight.data, refusal_dir, SCALE_FACTOR)

	    mlp_down = decoder_layer.mlp.down_proj
	    mlp_down.weight = modify_tensor(mlp_down.weight.data, refusal_dir, SCALE_FACTOR)

	bar_layers.close()

	print("Saving modified model (with original tokenizer)...")

	# Output location: the source model id with an "-abliterated" suffix.
	FIXED_ID = MODEL_ID + "-abliterated"
	model.save_pretrained(FIXED_ID)
	tokenizer.save_pretrained(FIXED_ID)
	```

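	According to the script, layer 19 is the primary target for abliteration: the refusal direction is measured from the hidden state at layer index 19 (`LAYER_FRACTION_TO_USE = 0.6` of the layer stack) and is then removed from the `self_attn.o_proj` and `mlp.down_proj` weights of every layer except the first and last.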

	# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
	Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/T145__ZEUS-8B-V2-abliterated-details)!
	Summarized results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/contents/viewer/default/train?q=T145%2FZEUS-8B-V2-abliterated&sort[column]=Average%20%E2%AC%86%EF%B8%8F&sort[direction]=desc)!

	| Metric            |Value (%)|
	|-------------------|--------:|
	|Average            |    29.71|
	|IFEval (0-Shot)    |    78.95|
	|BBH (3-Shot)       |    30.98|
	|MATH Lvl 5 (4-Shot)|    20.62|
	|GPQA (0-shot)      |     8.39|
	|MuSR (0-shot)      |     7.92|
	|MMLU-PRO (5-shot)  |    31.39|