In [1]:
!pip install huggingface_hub
!pip install -U datasets peft transformers[torch]
!pip install -q bitsandbytes trl accelerate
!pip install flash-attn --no-build-isolation



In [1]:
import json
import re
from pprint import pprint
 
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer
import re

In [2]:
torch.cuda.set_per_process_memory_fraction(0.8) 

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [4]:
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"

In [5]:
data = load_dataset("AI-4-Health/merged-datasets")

In [6]:
# data['train']['filename']

In [7]:
def create_model_and_tokenizer():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
 
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        use_safetensors=True,
        quantization_config=bnb_config,
        trust_remote_code=True,
        device_map="auto",
    )
 
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
 
    return model, tokenizer

In [35]:
model, tokenizer = create_model_and_tokenizer()
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# model.gradient_checkpointing_enable()

In [10]:
data['train']['text'][0]

"\n==== Front\nKnee Surg Sports Traumatol Arthrosc\nKnee Surg Sports Traumatol Arthrosc\nKnee Surgery, Sports Traumatology, Arthroscopy\n0942-2056\n1433-7347\nSpringer Berlin Heidelberg Berlin/Heidelberg\n\n32968845\n6290\n10.1007/s00167-020-06290-0\nSports Traumatology\nBilateral Looser zones or pseudofractures in the anteromedial tibia as a component of medial tibial stress syndrome in athletes\nStÃ¼rznickel Julian 1\nJandl Nico Maximilian 12\nDelsmann Maximilian M. 1\nvon Vopelius Emil 1\nBarvencik Florian 1\nhttp://orcid.org/0000-0003-2382-8348\nAmling Michael amling@uke.de\n\n1\nUeblacker Peter 13\nRolvien Tim 12\nOheim Ralf 1\n1 grid.13648.38 0000 0001 2180 3484 Department of Osteology and Biomechanics, University Medical Center Hamburg-Eppendorf, LottestraÃŸe 59, 22529 Hamburg, Germany\n2 grid.13648.38 0000 0001 2180 3484 Department of Orthopedics, University Medical Center Hamburg-Eppendorf, Hamburg, Germany\n3 Orthopedics and Sports Medicine, Munich, Germany\n23 9 2020\n23 9 2

In [11]:
# data preprocessing
def clean_text(text):
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = text.replace('====', ' ')
    text = text.replace('\n', ' ') # replace newline with space
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

def preprocess_function(examples):
    examples['text'] = [clean_text(text) for text in examples['text']]
    return examples

dataset = data['train'].map(preprocess_function, batched=True)

In [12]:
dataset['text'][0]

" Front Knee Surg Sports Traumatol Arthrosc Knee Surg Sports Traumatol Arthrosc Knee Surgery, Sports Traumatology, Arthroscopy 0942-2056 1433-7347 Springer Berlin Heidelberg Berlin/Heidelberg 32968845 6290 10.1007/s00167-020-06290-0 Sports Traumatology Bilateral Looser zones or pseudofractures in the anteromedial tibia as a component of medial tibial stress syndrome in athletes StÃ¼rznickel Julian 1 Jandl Nico Maximilian 12 Delsmann Maximilian M. 1 von Vopelius Emil 1 Barvencik Florian 1 Amling Michael amling@uke.de 1 Ueblacker Peter 13 Rolvien Tim 12 Oheim Ralf 1 1 grid.13648.38 0000 0001 2180 3484 Department of Osteology and Biomechanics, University Medical Center Hamburg-Eppendorf, LottestraÃŸe 59, 22529 Hamburg, Germany 2 grid.13648.38 0000 0001 2180 3484 Department of Orthopedics, University Medical Center Hamburg-Eppendorf, Hamburg, Germany 3 Orthopedics and Sports Medicine, Munich, Germany 23 9 2020 23 9 2020 2021 29 5 16441650 5 6 2020 14 9 2020 Â© The Author(s) 2020 Open Acces

In [36]:
# QLoRA config
lora_r = 16
lora_alpha = 64
lora_dropout = 0.1
lora_target_modules = [
    "q_proj",
    # "up_proj",
    # "o_proj",
    # "k_proj",
    # "down_proj",
    # "gate_proj",
    "v_proj",
]
 
 
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

In [37]:
OUTPUT_DIR = "experiments"
 
%load_ext tensorboard
%tensorboard --logdir experiments/runs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 361373), started 9:35:12 ago. (Use '!kill 361373' to kill it.)

In [38]:
from peft import get_peft_model

In [39]:
model = get_peft_model(model, peft_config)

In [40]:
model.print_trainable_parameters()

trainable params: 6,815,744 || all params: 8,037,076,992 || trainable%: 0.0848


In [41]:
from transformers import DataCollatorForLanguageModeling
from trl import SFTTrainer, SFTConfig

In [42]:
import os
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

In [50]:
training_arguments = SFTConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    logging_steps=1,
    learning_rate=1e-4,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=0.01,
    warmup_ratio=0.05,
    save_strategy="epoch",
    group_by_length=True,
    output_dir=OUTPUT_DIR,
    report_to="tensorboard",
    save_safetensors=True,
    lr_scheduler_type="cosine",
    seed=42,
)



In [51]:
splitted_dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [57]:
trainer = SFTTrainer(
    model=model,
    train_dataset=splitted_dataset['train'],
    eval_dataset=splitted_dataset['test'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

NameError: name 'EarlyStoppingCallback' is not defined

In [53]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [54]:
# torch.cuda.synchronize()
# torch.cuda.empty_cache()

In [55]:
trainer.train()

Step,Training Loss,Validation Loss
4,1.9812,1.774238
8,1.7535,1.754978
12,1.7726,1.726179
16,1.5575,1.692598
20,1.7902,1.662171
24,1.5989,1.633827
28,1.5265,1.615106
32,1.5606,1.600001
36,1.6185,1.588
40,1.6466,1.579654




KeyboardInterrupt: 

In [56]:
trainer.push_to_hub("AI-4-Health/HPP-FINETUNED-Meta-Llama-3-8B-Instruct")



training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

events.out.tfevents.1718234993.jupiter.364556.2:   0%|          | 0.00/61.6k [00:00<?, ?B/s]

events.out.tfevents.1718234875.jupiter.364556.1:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/shin00001/experiments/commit/61a909cf186f09d62099fa9b62a70519fd2be619', commit_message='AI-4-Health/HPP-FINETUNED-Meta-Llama-3-8B-Instruct', commit_description='', oid='61a909cf186f09d62099fa9b62a70519fd2be619', pr_url=None, pr_revision=None, pr_num=None)