In [1]:
!pip install -q peft transformers datasets huggingface_hub
!pip install flash-attn --no-build-isolation

[0m

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType, PeftConfig
import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from tqdm import tqdm
from huggingface_hub import notebook_login
from huggingface_hub import HfApi

In [17]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
api = HfApi()
api.upload_file(path_or_fileobj='prompt_tune_phi3.ipynb',
                path_in_repo='prompt_tune_phi3.ipynb',
                repo_id='Granther/prompt-tuned-phi3',
                repo_type='model'
               )

CommitInfo(commit_url='https://huggingface.co/Granther/prompt-tuned-phi3/commit/ab5911db092a8e53ea24c33f170e8013a8b172aa', commit_message='Upload prompt_tune_phi3.ipynb with huggingface_hub', commit_description='', oid='ab5911db092a8e53ea24c33f170e8013a8b172aa', pr_url=None, pr_revision=None, pr_num=None)

In [4]:
device = 'cuda'

model_id = 'microsoft/Phi-3-mini-128k-instruct'

peft_conf = PromptTuningConfig(
    peft_type=PeftType.PROMPT_TUNING, # what kind of peft
    task_type=TaskType.CAUSAL_LM,     # config task
    prompt_tuning_init=PromptTuningInit.TEXT, # Set to 'TEXT' to use prompt_tuning_init_text
    num_virtual_tokens=100, # x times the number of hidden transformer layers
    prompt_tuning_init_text="Classify if the tweet is a complaint or not:",
    tokenizer_name_or_path=model_id
)

dataset_name = "twitter_complaints"
checkpoint_name = f"{dataset_name}_{model_id}_{peft_conf.peft_type}_{peft_conf.task_type}_v1.pt".replace(
    "/", "_"
)

text_col = 'Tweet text'
label_col = 'text_label'
max_len = 64
lr = 3e-2
epochs = 5
batch_size = 8

In [5]:
dataset = load_dataset('ought/raft', dataset_name)

In [11]:
dataset['train'].features['Label'].names
#>>> ['Unlabeled', 'complaint', 'no complaint']

['Unlabeled', 'complaint', 'no complaint']

In [7]:
# Create lambda function
classes = [k.replace('_', ' ') for k in dataset['train'].features['Label'].names]
dataset = dataset.map(
    lambda x: {'text_label': [classes[label] for label in x['Label']]},
    batched=True,
    num_proc=10,
)

dataset['train'][0]

{'Tweet text': '@HMRCcustomers No this is my first job',
 'ID': 0,
 'Label': 2,
 'text_label': 'no complaint'}

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

if tokenizer.pad_token_id == None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

target_max_len = max([len(tokenizer(class_lab)['input_ids']) for class_lab in classes])
target_max_len # max length for tokenized labels

tokenizer(classes[0])['input_ids'] 
# Ids corresponding to the tokens in the sequence
# Attention mask is a binary tensor used in the transformer block to differentiate between padding tokens and meaningful ones

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[1, 853, 29880, 24025]

### Preprocess Function:
- Tokenize text and label
- Pad each example in the batch with tok.pad_token_id
- 

In [14]:
def preproc(example):
    batch_size = len(example[text_col])
    inputs = [f"{text_col} : {x} Label : " for x in example[text_col]]
    # This is the text data that will be tokenized as the model input
    targets = [str(x) for x in example[label_col]]
    # Define batch of targets corresponding to inputs
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    # Tokenize

    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id] # Appends to `input_ids` and not i

        model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
        # Afer tokenization, concatinate
        labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
        #>>> -100, -100, -100, -100, -100, -100, -100, -100, 1, 694, 15313, 524, 32000
        # Pad the beginning of the sequence with n -100s (ignore tokens)
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])

    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (target_max_len - len(sample_input_ids)) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (target_max_len - len(sample_input_ids)) + model_inputs["attention_mask"][i]
        labels["input_ids"][i] = [-100] * (target_max_len - len(sample_input_ids)) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:target_max_len])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:target_max_len])
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:target_max_len])
        model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [15]:
processed_datasets = dataset.map(
    preproc,
    batched=True, # uses default batch size
    num_proc=10,
    remove_columns=dataset["train"].column_names, # All columns from the original dataset will be removed in the new dataset
    load_from_cache_file=False,
    desc="Preprocessing dataset"
)

Preprocessing dataset (num_proc=10):   0%|          | 0/50 [00:00<?, ? examples/s]

Preprocessing dataset (num_proc=10):   0%|          | 0/3399 [00:00<?, ? examples/s]

In [16]:
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["test"]

train_dataloader = DataLoader(train_dataset,
                              shuffle=True, # shuffling is unneccasary since we are not training
                              collate_fn=default_data_collator,
                              batch_size=batch_size,
                              pin_memory=True # pin memory when using a GPU, makes loading data faster
                             )

eval_dataloader = DataLoader(eval_dataset,
                              shuffle=False,
                              collate_fn=default_data_collator,
                              batch_size=batch_size,
                              pin_memory=True
                              )

In [17]:
model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
model = get_peft_model(model, peft_conf)

# the rest of the model is frozen
print(model.print_trainable_parameters())

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


trainable params: 307,200 || all params: 3,821,386,752 || trainable%: 0.0080
None


In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * epochs),
)

In [19]:
model = model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

  0%|          | 0/7 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 7/7 [00:01<00:00,  5.36it/s]
100%|██████████| 425/425 [00:29<00:00, 14.23it/s]


epoch=0: train_ppl=tensor(nan, device='cuda:0') train_epoch_loss=tensor(nan, device='cuda:0') eval_ppl=tensor(nan, device='cuda:0') eval_epoch_loss=tensor(nan, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  7.66it/s]
100%|██████████| 425/425 [00:29<00:00, 14.26it/s]


epoch=1: train_ppl=tensor(nan, device='cuda:0') train_epoch_loss=tensor(nan, device='cuda:0') eval_ppl=tensor(nan, device='cuda:0') eval_epoch_loss=tensor(nan, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  7.76it/s]
100%|██████████| 425/425 [00:29<00:00, 14.25it/s]


epoch=2: train_ppl=tensor(nan, device='cuda:0') train_epoch_loss=tensor(nan, device='cuda:0') eval_ppl=tensor(nan, device='cuda:0') eval_epoch_loss=tensor(nan, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  7.72it/s]
100%|██████████| 425/425 [00:29<00:00, 14.24it/s]


epoch=3: train_ppl=tensor(nan, device='cuda:0') train_epoch_loss=tensor(nan, device='cuda:0') eval_ppl=tensor(nan, device='cuda:0') eval_epoch_loss=tensor(nan, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  7.77it/s]
100%|██████████| 425/425 [00:29<00:00, 14.18it/s]

epoch=4: train_ppl=tensor(nan, device='cuda:0') train_epoch_loss=tensor(nan, device='cuda:0') eval_ppl=tensor(nan, device='cuda:0') eval_epoch_loss=tensor(nan, device='cuda:0')





In [20]:
model.save_pretrained('model')

In [10]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer 

#tokenizer = AutoTokenizer.from_pretrained('model')

config = PeftConfig.from_pretrained('model')
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, 'model')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
inputs = tokenizer(
    f'{text_col} : {"@nationalgridus I have no water and the bill is current and paid. Can you do something about this?"} Label : ',
    return_tensors="pt",
)

In [15]:
model.to(device)

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    out = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])#, max_new_tokens=10) #, eos_token_id=3)
    #print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))



ValueError: Input length of input_ids is 32, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

In [24]:
pipe("@HMRCcustomers No this is my first job")

[{'generated_text': '@HMRCcustomers No this is my first job and I am not sure what to do. I have been told that I need to register with HMRC but I am not sure how to do this. Can you please help me?\n\n### response\nTo register with HMRC for your first job, you need to complete a Self Assessment tax return if you are self-employed or have income to report. For employees, you may need to complete'}]