# IDEFICS3_ROCO / learn.py
# Copyright (C) 2024 Ronan Le Meillat
# License: Apache License 2.0
# Description: Train the model on the dataset
import os
import torch
from huggingface_hub import login as hf_login
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration, TrainingArguments, Trainer
from datasets.utils.logging import disable_progress_bar
disable_progress_bar()
HF_TOKEN = ""
if os.environ.get('HF_TOKEN') is not None:
HF_TOKEN = os.environ.get('HF_TOKEN')
print(f"Hugging Face token found in environment variable")
hf_login(
token=HF_TOKEN,
add_to_git_credential=True
)
dataset_id = "eltorio/ROCO-radiology"
prompt = "You are an expert radiologist certified with over 15 years of experience in diagnostic imaging, describe this image"
source_model_id = "HuggingFaceM4/Idefics3-8B-Llama3"
# Intended target repo for the fine-tuned adapter. It is not wired into TrainingArguments below:
# with push_to_hub=True and no hub_model_id, the Trainer derives the Hub repo name from output_dir.
destination_model_id = "eltorio/ROCO-idefics3-8B"
output_dir = "IDEFICS3_ROCO"
cache_dir = "/workspace/data"
train_dataset = load_dataset(dataset_id, split="train", cache_dir=cache_dir)
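# Quick look at the data the collator below expects: each ROCO-radiology sample carries at least an
# "image" (PIL image) and a "caption" (str), which are the only two fields this script uses.
# A small inspection sketch, commented out so the training run stays non-interactive
# (the preview filename is arbitrary):
# sample = train_dataset[0]
# print(sample["caption"])
# sample["image"].convert("RGB").save("roco_sample_preview.png")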
DEVICE = "cuda:0"
USE_LORA = False
USE_QLORA = True
processor = AutoProcessor.from_pretrained(
    source_model_id,
    do_image_splitting=False  # process each image as a single view to limit the number of image tokens
)
if USE_QLORA or USE_LORA:
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules='.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$',
        use_dora=not USE_QLORA,  # DoRA only in the plain-LoRA path; it is skipped under 4-bit quantization
        init_lora_weights="gaussian"
    )
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )
    model = Idefics3ForConditionalGeneration.from_pretrained(
        source_model_id,
        torch_dtype=torch.float16,
        quantization_config=bnb_config if USE_QLORA else None,
    )
    model.add_adapter(lora_config)
    model.enable_adapters()
else:
    # Full fine-tuning path: no adapter, fp16 weights with FlashAttention 2
    model = Idefics3ForConditionalGeneration.from_pretrained(
        source_model_id,
        torch_dtype=torch.float16,
        _attn_implementation="flash_attention_2",  # works on A100 or H100
    ).to(DEVICE)
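# Optional sanity check (a sketch, not part of the original flow): with QLoRA/LoRA enabled, only the
# adapter weights should require gradients, so the trainable fraction printed here should be small.
# trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
# total = sum(p.numel() for p in model.parameters())
# print(f"trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")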
class MyDataCollator:
    def __init__(self, processor):
        self.processor = processor
        # Resolve the id of the "<image>" placeholder token once, for use when building labels
        self.image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index("<image>")
        ]

    def __call__(self, samples):
        texts = []
        images = []
        for sample in samples:
            image = sample["image"]
            answer = sample["caption"]
            messages = [
                {
                    "role": "system",
                    "content": [
                        {"type": "text", "text": prompt}
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image"},
                    ]
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": answer}
                    ]
                }
            ]
            text = self.processor.apply_chat_template(messages, add_generation_prompt=False)
            texts.append(text.strip())
            images.append([image.convert('RGB')])
        batch = self.processor(text=texts, images=images, return_tensors="pt", padding=True)
        # Build labels from input_ids, mapping padding token ids to the image token id
        labels = batch["input_ids"].clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = self.image_token_id
        batch["labels"] = labels
        return batch
data_collator = MyDataCollator(processor)
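# A minimal smoke test for the collator (a sketch; assumes two decoded samples fit comfortably in host
# memory). It prints each batch key with its tensor shape. Commented out to keep the script unchanged:
# _batch = data_collator([train_dataset[0], train_dataset[1]])
# print({k: tuple(v.shape) for k, v in _batch.items()})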
training_args = TrainingArguments(
    output_dir = output_dir,
    overwrite_output_dir = False,
    auto_find_batch_size = True,
    learning_rate = 2e-4,
    fp16 = True,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    gradient_accumulation_steps = 8,
    dataloader_pin_memory = False,
    save_total_limit = 3,
    evaluation_strategy = "no",  # no eval dataset is passed to the Trainer
    save_strategy = "steps",
    eval_steps = 100,
    save_steps = 10,  # checkpoint every 10 steps
    resume_from_checkpoint = True,  # stored only; actual resumption is triggered via trainer.train(resume_from_checkpoint=...)
    logging_steps = 5,
    remove_unused_columns = False,
    push_to_hub = True,
    label_names = ["labels"],
    load_best_model_at_end = False,
    report_to = "none",
    optim = "paged_adamw_8bit",
)
trainer = Trainer(
model = model,
args = training_args,
data_collator = data_collator,
train_dataset = train_dataset,
)
trainer.train()
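# The bare trainer.train() above always starts from scratch; since resume_from_checkpoint in
# TrainingArguments is informational only, resuming an interrupted run needs the flag on train() itself.
# A hedged sketch (get_last_checkpoint comes from transformers.trainer_utils; the guard avoids failing
# on a fresh run):
# from transformers.trainer_utils import get_last_checkpoint
# last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
# trainer.train(resume_from_checkpoint=last_checkpoint)
#
# Once training finishes, the LoRA adapter can be reloaded on top of the base model for inference.
# This is a sketch, assuming the adapter was saved or pushed by the Trainer; point PeftModel at a
# checkpoint-* directory under output_dir, or at the Hub repo this run pushed to:
# from peft import PeftModel
# base = Idefics3ForConditionalGeneration.from_pretrained(source_model_id, torch_dtype=torch.float16)
# tuned = PeftModel.from_pretrained(base, output_dir)  # or e.g. "IDEFICS3_ROCO/checkpoint-10" / a Hub repo id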