feat: check train executable
- article_base_train_no_qlora_test.py +109 -0
- article_base_train_test.py +33 -28
- test_inference.py +24 -0
article_base_train_no_qlora_test.py
ADDED
@@ -0,0 +1,109 @@
+import os, time, math
+import pandas as pd
+from datasets import Dataset
+from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig, TrainingArguments, Trainer
+import torch
+from PIL import Image
+from peft import get_peft_model, LoraConfig
+
+# Function to load custom dataset from CSV
+def load_custom_dataset_from_csv(csv_file, image_folder):
+    # Load CSV data using pandas
+    data = pd.read_csv(csv_file)
+
+    # Prepare dataset format for Hugging Face
+    questions = data['question'].tolist()
+    images = [os.path.join(image_folder, img) for img in data['image'].tolist()]
+    answers = data['answer'].tolist()
+
+    # Create a Hugging Face dataset from the loaded CSV
+    return Dataset.from_dict({
+        'question': questions,
+        'image': images,
+        'answer': answers
+    })
+
+# Main training function
+def main():
+    # Load custom datasets
+    dataset = load_custom_dataset_from_csv('dataset/train_samples.csv', 'dataset/images/train')
+    train_val_split = dataset.train_test_split(test_size=0.1)
+
+    train_ds = train_val_split['train']
+    val_ds = train_val_split['test']
+    # train_ds = load_custom_dataset_from_csv('dataset/train_samples.csv', 'dataset/images')
+    # val_ds = load_custom_dataset_from_csv('dataset/val.csv', 'dataset/images')
+
+    model_id = "google/paligemma-3b-pt-224"
+    processor = PaliGemmaProcessor.from_pretrained(model_id)
+    device = "cuda"
+
+    # bnb_config = BitsAndBytesConfig(
+    #     load_in_4bit=True,
+    #     bnb_4bit_quant_type="nf4",
+    #     # bnb_4bit_compute_type=torch.bfloat16,
+    #     # bnb_4bit_compute_type=torch.float16
+    #     bnb_4bit_compute_dtype=torch.bfloat16
+    #     # bnb_4bit_use_double_quant=True,
+    # )
+    # lora_config = LoraConfig(
+    #     r=8,
+    #     target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
+    #     task_type="CAUSAL_LM"
+    # )
+
+    # model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": 0})
+    # model.gradient_checkpointing_enable()
+    model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)
+    for param in model.vision_tower.parameters():
+        param.requires_grad = False
+
+    for param in model.multi_modal_projector.parameters():
+        param.requires_grad = True
+
+    # model.print_trainable_parameters()
+
+    args = TrainingArguments(
+        output_dir=f"./output/{math.floor(time.time())}",
+        num_train_epochs=2,
+        remove_unused_columns=False,
+        # per_device_train_batch_size=16,
+        per_device_train_batch_size=1,
+        gradient_accumulation_steps=4,
+        warmup_steps=2,
+        learning_rate=2e-5,
+        weight_decay=1e-6,
+        logging_steps=100,
+        # optim="paged_adamw_8bit",
+        optim="adamw_hf",
+        save_strategy="steps",
+        save_steps=1000,
+        save_total_limit=1,
+        bf16=True,
+        report_to=["tensorboard"],
+        dataloader_pin_memory=False
+    )
+
+    # Custom collate function
+    def collate_fn(examples):
+        # texts = ["answer " + example["question"] for example in examples]
+        texts = [example["question"] for example in examples]
+        labels = [example['answer'] for example in examples]
+        # images = [Image.open(image_path).convert("RGB") for image_path in examples['image']]
+        images = [Image.open(example['image']).convert("RGB") for example in examples]
+        tokens = processor(text=texts, images=images, suffix=labels, return_tensors="pt", padding="longest")
+        tokens = tokens.to(torch.bfloat16).to(device)
+        return tokens
+
+    trainer = Trainer(
+        model=model,
+        train_dataset=train_ds,
+        eval_dataset=val_ds,
+        data_collator=collate_fn,
+        args=args
+    )
+
+    trainer.train()
+
+if __name__ == "__main__":
+    main()
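A note on the no-QLoRA variant above: `model.print_trainable_parameters()` is left commented out because that method is attached by PEFT's `get_peft_model`, which this script never calls. A minimal sketch of an equivalent check for the plain bf16 model (hypothetical helper, not part of the commit):

    # Hypothetical helper, not part of the commit: mimics PEFT's
    # print_trainable_parameters() for a plain (non-PEFT) model, to confirm
    # the vision tower is frozen while the projector and language model train.
    def count_trainable_parameters(model):
        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
        total = sum(p.numel() for p in model.parameters())
        print(f"trainable params: {trainable:,} || all params: {total:,} || trainable%: {100 * trainable / total:.4f}")

Called right after the freeze/unfreeze loops, this should report everything except the vision tower as trainable.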
article_base_train_test.py
CHANGED
@@ -1,50 +1,50 @@
-import os
-import
-from datasets import
+import os, time, math
+import pandas as pd
+from datasets import Dataset
 from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig, TrainingArguments, Trainer
 import torch
 from PIL import Image
 from peft import get_peft_model, LoraConfig
 
-# Function to load custom dataset
-def
-
-
+# Function to load custom dataset from CSV
+def load_custom_dataset_from_csv(csv_file, image_folder):
+    # Load CSV data using pandas
+    data = pd.read_csv(csv_file)
 
     # Prepare dataset format for Hugging Face
-    questions = []
-    images = []
-    answers = []
-    multiple_choice_answers = []
-
-    for item in data:
-        questions.append(item['question'])
-        images.append(os.path.join(image_folder, item['image_id']))
-        answers.append(item['answer'])
-        multiple_choice_answers.append(item['multiple_choice_answer'])
+    questions = data['question'].tolist()
+    images = [os.path.join(image_folder, img) for img in data['image'].tolist()]
+    answers = data['answer'].tolist()
 
+    # Create a Hugging Face dataset from the loaded CSV
     return Dataset.from_dict({
         'question': questions,
         'image': images,
-        'answer': answers
-        'multiple_choice_answer': multiple_choice_answers
+        'answer': answers
     })
 
 # Main training function
 def main():
-    # Load custom
-
-
+    # Load custom datasets
+    dataset = load_custom_dataset_from_csv('dataset/train_samples.csv', 'dataset/images/train')
+    train_val_split = dataset.train_test_split(test_size=0.1)
 
+    train_ds = train_val_split['train']
+    val_ds = train_val_split['test']
+    # train_ds = load_custom_dataset_from_csv('dataset/train_samples.csv', 'dataset/images')
+    # val_ds = load_custom_dataset_from_csv('dataset/val.csv', 'dataset/images')
+
     model_id = "google/paligemma-3b-pt-224"
     processor = PaliGemmaProcessor.from_pretrained(model_id)
-    image_token = processor.tokenizer.convert_tokens_to_ids("<image>")
     device = "cuda"
 
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_type=torch.bfloat16
+        # bnb_4bit_compute_type=torch.bfloat16,
+        # bnb_4bit_compute_type=torch.float16
+        bnb_4bit_compute_dtype=torch.bfloat16
+        # bnb_4bit_use_double_quant=True,
     )
     lora_config = LoraConfig(
         r=8,
@@ -53,13 +53,16 @@ def main():
     )
 
     model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": 0})
+    # model.gradient_checkpointing_enable()
     model = get_peft_model(model, lora_config)
     model.print_trainable_parameters()
 
     args = TrainingArguments(
+        output_dir=f"./output/{math.floor(time.time())}",
         num_train_epochs=2,
         remove_unused_columns=False,
-        per_device_train_batch_size=16,
+        # per_device_train_batch_size=16,
+        per_device_train_batch_size=4,
         gradient_accumulation_steps=4,
         warmup_steps=2,
         learning_rate=2e-5,
@@ -76,9 +79,11 @@ def main():
 
     # Custom collate function
     def collate_fn(examples):
-        texts = ["answer " + example["question"] for example in examples]
-
-
+        # texts = ["answer " + example["question"] for example in examples]
+        texts = [example["question"] for example in examples]
+        labels = [example['answer'] for example in examples]
+        # images = [Image.open(image_path).convert("RGB") for image_path in examples['image']]
+        images = [Image.open(example['image']).convert("RGB") for example in examples]
         tokens = processor(text=texts, images=images, suffix=labels, return_tensors="pt", padding="longest")
         tokens = tokens.to(torch.bfloat16).to(device)
        return tokens
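Two details in this QLoRA path are easy to miss: the removed `bnb_4bit_compute_type` keyword was never a valid `BitsAndBytesConfig` argument (the accepted name is `bnb_4bit_compute_dtype`, as the replacement uses), and `per_device_train_batch_size=4` with `gradient_accumulation_steps=4` keeps the effective batch size at 16, matching the removed `per_device_train_batch_size=16`. The collator leans on the processor's `suffix` argument to build the training labels. A hypothetical smoke test (sample row and image path are assumptions, not part of the commit):

    # Hypothetical check, not part of the commit: run the collator on one
    # assumed sample row and confirm the processor's `suffix` argument
    # produced a labels tensor alongside the usual model inputs.
    batch = collate_fn([{
        'question': 'describe the painting',          # assumed question text
        'image': 'dataset/images/train/example.jpg',  # assumed image path
        'answer': 'a swirling night sky',             # assumed answer text
    }])
    print(batch.keys())           # expect input_ids, attention_mask, pixel_values, labels
    print(batch['labels'].shape)  # suffix (answer) tokens are supervised; the prefix is masked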
test_inference.py
ADDED
@@ -0,0 +1,24 @@
+from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
+from PIL import Image
+
+
+def main():
+    model_id = "google/paligemma-3b-pt-224"
+    # model_path = "output/1727488022/checkpoint-112"
+    model_path = "output/1727490265/checkpoint-450"
+    model = PaliGemmaForConditionalGeneration.from_pretrained(model_path)
+    processor = AutoProcessor.from_pretrained(model_id)
+
+    # prompt = "Analyze image from a critic's point of view."
+    prompt = "Please construct a formal analysis paragraph that is coherent and focuses solely on visual characteristic."
+    image_file_path = "dataset/images/manual_test/starry_night.jpg"
+    raw_image = Image.open(image_file_path)
+    inputs = processor(prompt, raw_image, return_tensors="pt")
+    output = model.generate(**inputs, max_new_tokens=20)
+
+    # Starry Night
+    print("Response: ", processor.decode(output[0], skip_special_tokens=True)[len(prompt):])
+
+
+if __name__ == "__main__":
+    main()
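One caveat in the decode step above: slicing with `[len(prompt):]` trims the decoded string by character count, which only approximately matches what the processor actually prepends to the prompt. A token-based alternative (a sketch, not part of the commit):

    # Hypothetical variant, not part of the commit: decode only the newly
    # generated tokens instead of slicing the decoded text by characters.
    input_len = inputs["input_ids"].shape[-1]
    generated_tokens = output[0][input_len:]
    print("Response:", processor.decode(generated_tokens, skip_special_tokens=True))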