pizb committed on
Commit
1758c0c
·
1 Parent(s): 812fd7c

feat: merge baseline and add other format metadata

Browse files
article_base_train.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, time, math
2
+ import pandas as pd
3
+ from datasets import Dataset
4
+ from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig, TrainingArguments, Trainer
5
+ import torch
6
+ from PIL import Image
7
+ from peft import get_peft_model, LoraConfig
8
+ import argparse
9
+
10
+
11
+ # Function to load custom dataset from CSV
12
+ def load_custom_dataset_from_csv(csv_file, image_folder):
13
+ # Load CSV data using pandas
14
+ data = pd.read_csv(csv_file)
15
+
16
+ # Prepare dataset format for Hugging Face
17
+ questions = data['question'].tolist()
18
+ images = [os.path.join(image_folder, img) for img in data['image'].tolist()]
19
+ answers = data['answer'].tolist()
20
+
21
+ # Create a Hugging Face dataset from the loaded CSV
22
+ return Dataset.from_dict({
23
+ 'question': questions,
24
+ 'image': images,
25
+ 'answer': answers
26
+ })
27
+
28
+
29
+ # Function to load custom dataset from Parquet
30
+ def load_custom_dataset_from_parquet(parquet_file, image_folder):
31
+ # Load Parquet data using pandas
32
+ data = pd.read_parquet(parquet_file)
33
+
34
+ # Prepare dataset format for Hugging Face
35
+ questions = data['question'].tolist()
36
+ images = [os.path.join(image_folder, img) for img in data['image'].tolist()]
37
+ answers = data['answer'].tolist()
38
+
39
+ # Create a Hugging Face dataset from the loaded Parquet
40
+ return Dataset.from_dict({
41
+ 'question': questions,
42
+ 'image': images,
43
+ 'answer': answers
44
+ })
45
+
46
+
47
+ # Choose the appropriate loader based on metadata_type argument
48
+ def load_dataset_by_type(metadata_type, dataset_dir, image_folder):
49
+ if metadata_type == "csv":
50
+ return load_custom_dataset_from_csv(
51
+ os.path.join(dataset_dir, 'train_samples.csv'),
52
+ image_folder
53
+ )
54
+ elif metadata_type == "parquet":
55
+ return load_custom_dataset_from_parquet(
56
+ os.path.join(dataset_dir, 'train.parquet'),
57
+ image_folder
58
+ )
59
+ else:
60
+ raise ValueError("Unsupported metadata type. Use 'csv' or 'parquet'.")
61
+
62
+
63
+ def load_model_and_args(use_qlora, model_id, device, output_dir):
64
+ if use_qlora:
65
+ bnb_config = BitsAndBytesConfig(
66
+ load_in_4bit=True,
67
+ bnb_4bit_quant_type="nf4",
68
+ bnb_4bit_compute_dtype=torch.bfloat16
69
+ )
70
+ lora_config = LoraConfig(
71
+ r=8,
72
+ target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
73
+ task_type="CAUSAL_LM"
74
+ )
75
+
76
+ model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": 0})
77
+ model = get_peft_model(model, lora_config)
78
+ model.print_trainable_parameters()
79
+
80
+ # TODO: Customize training setting
81
+ args = TrainingArguments(
82
+ output_dir=os.path.join(output_dir, f"{math.floor(time.time())}"),
83
+ num_train_epochs=2,
84
+ remove_unused_columns=False,
85
+ per_device_train_batch_size=1,
86
+ gradient_accumulation_steps=4,
87
+ warmup_steps=2,
88
+ learning_rate=2e-5,
89
+ weight_decay=1e-6,
90
+ logging_steps=100,
91
+ optim="adamw_hf",
92
+ save_strategy="steps",
93
+ save_steps=1000,
94
+ save_total_limit=1,
95
+ bf16=True,
96
+ report_to=["tensorboard"],
97
+ dataloader_pin_memory=False
98
+ )
99
+
100
+ return model, args
101
+ else:
102
+ model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)
103
+ for param in model.vision_tower.parameters():
104
+ param.requires_grad = False
105
+
106
+ for param in model.multi_modal_projector.parameters():
107
+ param.requires_grad = True
108
+
109
+ # TODO: Customize training setting
110
+ args = TrainingArguments(
111
+ output_dir=os.path.join(output_dir, f"{math.floor(time.time())}"),
112
+ num_train_epochs=2,
113
+ remove_unused_columns=False,
114
+ per_device_train_batch_size=4,
115
+ gradient_accumulation_steps=4,
116
+ warmup_steps=2,
117
+ learning_rate=2e-5,
118
+ weight_decay=1e-6,
119
+ logging_steps=100,
120
+ optim="paged_adamw_8bit",
121
+ save_strategy="steps",
122
+ save_steps=1000,
123
+ save_total_limit=1,
124
+ bf16=True,
125
+ report_to=["tensorboard"],
126
+ dataloader_pin_memory=False
127
+ )
128
+
129
+ return model, args
130
+
131
+
132
+ # Main training function
133
+ def main(args):
134
+ dataset_dir = args.dataset_dir
135
+ model_id = args.model_id
136
+ output_dir = args.output_dir
137
+ metadata_type = args.metadata_type
138
+
139
+ # Load custom datasets
140
+ # dataset = load_custom_dataset_from_csv(
141
+ # os.path.join(dataset_dir, 'train_samples.csv'),
142
+ # os.path.join(dataset_dir, 'images/train')) # TODO: change to appropriate path
143
+ dataset = load_dataset_by_type(metadata_type, dataset_dir, os.path.join(dataset_dir, 'images/train'))
144
+ train_val_split = dataset.train_test_split(test_size=0.1)
145
+
146
+ train_ds = train_val_split['train']
147
+ val_ds = train_val_split['test']
148
+
149
+ processor = PaliGemmaProcessor.from_pretrained(model_id)
150
+ device = "cuda"
151
+
152
+ model, args = load_model_and_args(args.use_qlora, model_id, device, output_dir)
153
+
154
+ # Custom collate function
155
+ def collate_fn(examples):
156
+ texts = [example["question"] for example in examples]
157
+ labels = [example['answer'] for example in examples]
158
+ images = [Image.open(example['image']).convert("RGB") for example in examples]
159
+ tokens = processor(text=texts, images=images, suffix=labels, return_tensors="pt", padding="longest")
160
+ tokens = tokens.to(torch.bfloat16).to(device)
161
+ return tokens
162
+
163
+ trainer = Trainer(
164
+ model=model,
165
+ train_dataset=train_ds,
166
+ eval_dataset=val_ds,
167
+ data_collator=collate_fn,
168
+ args=args
169
+ )
170
+
171
+ trainer.train()
172
+
173
+
174
+ def parse_args():
175
+ parser = argparse.ArgumentParser(description="Train a model with custom dataset")
176
+ parser.add_argument('--dataset_dir', type=str, default='./dataset', help='Path to the folder containing the images')
177
+ parser.add_argument('--model_id', type=str, default='google/paligemma-3b-pt-224', help='Model ID to use for training')
178
+ parser.add_argument('--output_dir', type=str, default='./output', help='Directory to save the output')
179
+ parser.add_argument('--use_qlora', type=bool, default=False, help='Use QLoRA for training')
180
+ parser.add_argument('--metadata_type', type=str, default='parquet', choices=['csv', 'parquet'], help='Metadata format (csv or parquet)')
181
+ return parser.parse_args()
182
+
183
+
184
+ if __name__ == "__main__":
185
+ args = parse_args()
186
+ main(args)
article_base_train_no_qlora_test.py DELETED
@@ -1,85 +0,0 @@
1
- import os, time, math
2
- import pandas as pd
3
- from datasets import Dataset
4
- from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig, TrainingArguments, Trainer
5
- import torch
6
- from PIL import Image
7
- from peft import get_peft_model, LoraConfig
8
-
9
- # Function to load custom dataset from CSV
10
- def load_custom_dataset_from_csv(csv_file, image_folder):
11
- # Load CSV data using pandas
12
- data = pd.read_csv(csv_file)
13
-
14
- # Prepare dataset format for Hugging Face
15
- questions = data['question'].tolist()
16
- images = [os.path.join(image_folder, img) for img in data['image'].tolist()]
17
- answers = data['answer'].tolist()
18
-
19
- # Create a Hugging Face dataset from the loaded CSV
20
- return Dataset.from_dict({
21
- 'question': questions,
22
- 'image': images,
23
- 'answer': answers
24
- })
25
-
26
- # Main training function
27
- def main():
28
- # Load custom datasets
29
- dataset = load_custom_dataset_from_csv('dataset/train_samples.csv', 'dataset/images/train')
30
- train_val_split = dataset.train_test_split(test_size=0.1)
31
-
32
- train_ds = train_val_split['train']
33
- val_ds = train_val_split['test']
34
-
35
- model_id = "google/paligemma-3b-pt-224"
36
- processor = PaliGemmaProcessor.from_pretrained(model_id)
37
- device = "cuda"
38
-
39
- model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)
40
- for param in model.vision_tower.parameters():
41
- param.requires_grad = False
42
-
43
- for param in model.multi_modal_projector.parameters():
44
- param.requires_grad = True
45
-
46
- args = TrainingArguments(
47
- output_dir=f"./output/{math.floor(time.time())}",
48
- num_train_epochs=2,
49
- remove_unused_columns=False,
50
- per_device_train_batch_size=1,
51
- gradient_accumulation_steps=4,
52
- warmup_steps=2,
53
- learning_rate=2e-5,
54
- weight_decay=1e-6,
55
- logging_steps=100,
56
- optim="adamw_hf",
57
- save_strategy="steps",
58
- save_steps=1000,
59
- save_total_limit=1,
60
- bf16=True,
61
- report_to=["tensorboard"],
62
- dataloader_pin_memory=False
63
- )
64
-
65
- # Custom collate function
66
- def collate_fn(examples):
67
- texts = [example["question"] for example in examples]
68
- labels = [example['answer'] for example in examples]
69
- images = [Image.open(example['image']).convert("RGB") for example in examples]
70
- tokens = processor(text=texts, images=images, suffix=labels, return_tensors="pt", padding="longest")
71
- tokens = tokens.to(torch.bfloat16).to(device)
72
- return tokens
73
-
74
- trainer = Trainer(
75
- model=model,
76
- train_dataset=train_ds,
77
- eval_dataset=val_ds,
78
- data_collator=collate_fn,
79
- args=args
80
- )
81
-
82
- trainer.train()
83
-
84
- if __name__ == "__main__":
85
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
article_base_train_test.py DELETED
@@ -1,93 +0,0 @@
1
- import os, time, math
2
- import pandas as pd
3
- from datasets import Dataset
4
- from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig, TrainingArguments, Trainer
5
- import torch
6
- from PIL import Image
7
- from peft import get_peft_model, LoraConfig
8
-
9
- # Function to load custom dataset from CSV
10
- def load_custom_dataset_from_csv(csv_file, image_folder):
11
- # Load CSV data using pandas
12
- data = pd.read_csv(csv_file)
13
-
14
- # Prepare dataset format for Hugging Face
15
- questions = data['question'].tolist()
16
- images = [os.path.join(image_folder, img) for img in data['image'].tolist()]
17
- answers = data['answer'].tolist()
18
-
19
- # Create a Hugging Face dataset from the loaded CSV
20
- return Dataset.from_dict({
21
- 'question': questions,
22
- 'image': images,
23
- 'answer': answers
24
- })
25
-
26
- # Main training function
27
- def main():
28
- # Load custom datasets
29
- dataset = load_custom_dataset_from_csv('dataset/train_samples.csv', 'dataset/images/train')
30
- train_val_split = dataset.train_test_split(test_size=0.1)
31
-
32
- train_ds = train_val_split['train']
33
- val_ds = train_val_split['test']
34
-
35
- model_id = "google/paligemma-3b-pt-224"
36
- processor = PaliGemmaProcessor.from_pretrained(model_id)
37
- device = "cuda"
38
-
39
- bnb_config = BitsAndBytesConfig(
40
- load_in_4bit=True,
41
- bnb_4bit_quant_type="nf4",
42
- bnb_4bit_compute_dtype=torch.bfloat16
43
- )
44
- lora_config = LoraConfig(
45
- r=8,
46
- target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
47
- task_type="CAUSAL_LM"
48
- )
49
-
50
- model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": 0})
51
- model = get_peft_model(model, lora_config)
52
- model.print_trainable_parameters()
53
-
54
- args = TrainingArguments(
55
- output_dir=f"./output/{math.floor(time.time())}",
56
- num_train_epochs=2,
57
- remove_unused_columns=False,
58
- per_device_train_batch_size=4,
59
- gradient_accumulation_steps=4,
60
- warmup_steps=2,
61
- learning_rate=2e-5,
62
- weight_decay=1e-6,
63
- logging_steps=100,
64
- optim="paged_adamw_8bit",
65
- save_strategy="steps",
66
- save_steps=1000,
67
- save_total_limit=1,
68
- bf16=True,
69
- report_to=["tensorboard"],
70
- dataloader_pin_memory=False
71
- )
72
-
73
- # Custom collate function
74
- def collate_fn(examples):
75
- texts = [example["question"] for example in examples]
76
- labels = [example['answer'] for example in examples]
77
- images = [Image.open(example['image']).convert("RGB") for example in examples]
78
- tokens = processor(text=texts, images=images, suffix=labels, return_tensors="pt", padding="longest")
79
- tokens = tokens.to(torch.bfloat16).to(device)
80
- return tokens
81
-
82
- trainer = Trainer(
83
- model=model,
84
- train_dataset=train_ds,
85
- eval_dataset=val_ds,
86
- data_collator=collate_fn,
87
- args=args
88
- )
89
-
90
- trainer.train()
91
-
92
- if __name__ == "__main__":
93
- main()