---
license: mit
---
|
|
|
code used to train

- https://gist.github.com/thistleknot/398466a4bd75a1315825d7f04db635a6/91194e69c5220e536f45536e6b7dd66995c847b1
|
|
|
currently set up to translate individual premises (sentences) into first-order logic (FOL)
|
|
|
datasets
|
|
|
- Yale-LILY/FOLIO
- apergo-ai/text2log (1661 records)
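
Both datasets are on the Hugging Face Hub, so they can be pulled with the `datasets` library if you want to inspect the training data. A minimal sketch; the split name and record layout are assumptions and may differ from what the Hub actually serves:

```
from datasets import load_dataset

# premises/conclusions with FOL annotations
folio = load_dataset("Yale-LILY/FOLIO")
# ~1661 sentence -> FOL pairs
text2log = load_dataset("apergo-ai/text2log")

print(folio)
print(text2log["train"][0])  # assumption: a "train" split exists
```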
|
|
|
how to load
|
|
|
```
import torch
import transformers
import pyreft

device = "cuda"
model_name_or_path = "microsoft/Phi-3-mini-4k-instruct"

# load the base model
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name_or_path, torch_dtype=torch.bfloat16, device_map=device)

# attach the trained ReFT interventions from the Hub
reft_model = pyreft.ReftModel.load(
    "LaferriereJC/Phi-3-mini-4k-instruct-FOL-pyreft", model
)
```
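
Note: `pyreft.ReftModel.load` pulls the saved interventions straight from the Hub repo. The walkthrough below instead rebuilds the intervention config by hand and loads the weight files one by one from a local clone, which also shows what the saved artifacts look like on disk.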
|
|
|
how to use
|
```
!git clone https://huggingface.co/LaferriereJC/Phi-3-mini-4k-instruct-FOL-pyreft

import os

import torch
import transformers
import pyreft

device = 'cuda'
model_name_or_path = "microsoft/Phi-3-mini-4k-instruct"

attn_implementation = "eager"
torch_dtype = torch.bfloat16

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name_or_path, torch_dtype=torch_dtype, device_map=device,
    attn_implementation=attn_implementation, trust_remote_code=True)

# Define the pyreft configuration: one rank-16 LoReFT intervention
# on the output of every decoder layer
layers = range(model.config.num_hidden_layers)
representations = [{
    "component": f"model.layers[{l}].output",
    "intervention": pyreft.LoreftIntervention(
        embed_dim=model.config.hidden_size,
        low_rank_dimension=16
    )
} for l in layers]

reft_config = pyreft.ReftConfig(representations=representations)

# Initialize the pyreft model
reft_model = pyreft.get_reft_model(model, reft_config)

# Load the saved intervention weights from the cloned repo
local_directory = "./Phi-3-mini-4k-instruct-FOL-pyreft"
interventions = {}
for l in layers:
    component = f"model.layers[{l}].output"
    file_path = os.path.join(local_directory, f"intkey_comp.{component}.unit.pos.nunit.1#0.bin")
    if os.path.exists(file_path):
        with open(file_path, "rb") as f:
            adjusted_key = f"comp.{component}.unit.pos.nunit.1#0"
            interventions[adjusted_key] = torch.load(f)

# Apply the loaded weights to the matching interventions
for component, state_dict in interventions.items():
    if component in reft_model.interventions:
        reft_model.interventions[component][0].load_state_dict(state_dict)
    else:
        print(f"Key mismatch: {component} not found in reft_model.interventions")

# Move the interventions to CUDA
reft_model.set_device("cuda")

# Verify the model
reft_model.print_trainable_parameters()

# get tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name_or_path, model_max_length=216,
    padding_side="right", use_fast=True
    #, add_eos_token=True, add_bos_token=True
)

tokenizer.pad_token = tokenizer.eos_token

# position info about the interventions
share_weights = True  # whether the prefix and suffix interventions share weights
positions = "f3+l3"   # intervene on the f[irst] 3 and l[ast] 3 prompt tokens
first_n, last_n = pyreft.parse_positions(positions)

terminators = [
    tokenizer.eos_token_id,
]

prompt_no_input_template = """\n<|user|>:%s</s>\n<|assistant|>:"""

test_instruction = f"""tell me something I don't know"""
# tokenize and prepare the input
prompt = prompt_no_input_template % test_instruction
prompt = tokenizer(prompt, return_tensors="pt").to(device)

# compute which token positions each intervention should act on
unit_locations = torch.IntTensor([pyreft.get_intervention_locations(
    last_position=prompt["input_ids"].shape[-1],
    first_n=first_n,
    last_n=last_n,
    pad_mode="last",
    num_interventions=len(reft_config.representations),
    share_weights=share_weights
)]).permute(1, 0, 2).tolist()

_, reft_response = reft_model.generate(
    prompt, unit_locations={"sources->base": (None, unit_locations)},
    intervene_on_prompt=True, max_new_tokens=537, do_sample=True,
    top_k=50, temperature=0.7,
    eos_token_id=terminators, early_stopping=True
)
print(tokenizer.decode(reft_response[0], skip_special_tokens=True))
```
|
|
|
response
|
```
:tell me something I don't know</s> :exists x1.(_thing(x1) & _donknow(x1))
```
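
The expression the model emits is in NLTK's logic syntax, so one way to sanity-check an output is to round-trip it through NLTK's parser. A minimal sketch, assuming `nltk` is installed; the `is_valid_fol` helper is not part of the original code:

```
from nltk.sem.logic import Expression

def is_valid_fol(candidate: str) -> bool:
    """Return True if the string parses as a well-formed FOL expression."""
    try:
        Expression.fromstring(candidate)
        return True
    except Exception:
        return False

# expect True for the sample output above
print(is_valid_fol("exists x1.(_thing(x1) & _donknow(x1))"))
```

This only checks well-formedness, not faithfulness to the source sentence.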
|
|
|
training settings
|
```
per_device_train_batch_size=6,
logging_steps=1,
optim='paged_lion_8bit',
gradient_checkpointing_kwargs={"use_reentrant": False},
learning_rate=0.0003,
warmup_ratio=.1,
adam_beta2=0.95,
adam_epsilon=0.00001,
save_strategy='epoch',
max_grad_norm=1.0,
lr_scheduler_type='cosine',
```
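
These are keyword arguments to `transformers.TrainingArguments`. A minimal sketch of how they might be assembled; the output directory is an assumption, the epoch count comes from the evaluation note below, and the actual trainer wiring lives in the gist linked at the top:

```
import transformers

training_args = transformers.TrainingArguments(
    output_dir="./reft-fol",   # assumption: not specified in this card
    num_train_epochs=1,        # the card reports training for 1 epoch
    per_device_train_batch_size=6,
    logging_steps=1,
    optim='paged_lion_8bit',   # requires bitsandbytes
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=0.0003,
    warmup_ratio=.1,
    adam_beta2=0.95,
    adam_epsilon=0.00001,
    save_strategy='epoch',
    max_grad_norm=1.0,
    lr_scheduler_type='cosine',
)
```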
|
|
|
Evaluation:

I kept tweaking the model until I got confirmations from ChatGPT-4 that the generated FOL expressions fit their source sentences; the final training loss (1 epoch) came in consistently under 0.5 (10-point EMA with alpha of 0.42).
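
The smoothing referenced there is an exponential moving average; a minimal sketch of the calculation, assuming a simple recursive EMA with alpha = 0.42 over the logged loss values (the values below are placeholders, not the actual training log):

```
def ema(values, alpha=0.42):
    """Exponentially weighted moving average of a loss series."""
    smoothed, s = [], values[0]
    for v in values:
        s = alpha * v + (1 - alpha) * s
        smoothed.append(s)
    return smoothed

losses = [1.2, 0.9, 0.7, 0.55, 0.5, 0.45, 0.4, 0.42, 0.38, 0.35]  # placeholders
print(ema(losses)[-1])  # last smoothed value of the 10-point series
```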
|
Loss
|
|
|
![image/png](https://cdn-uploads.huggingface.co/production/uploads/62578ad28c6638f8a93e8856/Y_uAhaHH3ko6Z6tjYSy0y.png)
|
|
|
|
|
To check an output, I pasted the prompt/response pair into ChatGPT-4 and asked whether the expression fit, e.g.:

```
:tell me something I don't know</s> :exists x1.(_thing(x1) & _donknow(x1))
Does the fol expression fit?
```

The verdict depended on how I phrased the question; for example, ChatGPT would almost always suggest revisions if I instead asked:

- Is the fol expression adequate?
- How faithful is the fol expression?
|
|
|
|