meow2018 committed · Commit 1d5dd06 · verified · 1 Parent(s): 506d699

Upload yolo_world_s_pretrain_FG_v3.py

configs/pretrain/yolo_world_s_pretrain_FG_v3.py ADDED
@@ -0,0 +1,190 @@
+ # deploy:
+ # python deploy/deploy.py /data/taofuyu/models/yolo_world/detection_onnxruntime_static.py /data/taofuyu/models/yolo_world/yolo_world_s_pretrain_FG_v2.py /data/taofuyu/snapshot/yolo_world/fg_pretrain_v2/epoch_1.pth /data/taofuyu/tao_dataset/FG/训练FG_无车顶车窗/192_168_1_123_2_2024-01-15_09-53-05_2024-01-15_09-54-20_0.jpg --test-img /data/taofuyu/tao_dataset/FG/test_wrong/现场问题/第171次车位引导【车头图】识别结果推送_车位2_1_闽A4YY27_None_picture_2023_11_29_18_6_55.jpg --work-dir /data/taofuyu/log/yolo_world/fg_pretrain_v2/
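+ # deploy.py above is assumed to be the mmdeploy-based export script bundled
+ # with YOLO-World: positional args are <deploy cfg> <model cfg> <checkpoint>
+ # <sample image>, plus --test-img for a visual sanity check and --work-dir
+ # for the exported artifacts. All paths are site-specific and kept verbatim.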
+ _base_ = (
+     '/data/taofuyu/repos/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py')
+ custom_imports = dict(
+     imports=['yolo_world'],
+     allow_failed_imports=False)
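+ # custom_imports makes mmengine import the yolo_world package up front, so
+ # the YOLOWorld* registry entries used below resolve without manual imports.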
+
+ # hyper-parameters
+ num_classes = 5  # number of classes at test time
+ num_training_classes = 80  # number of classes during training
+ max_epochs = 100  # maximum training epochs
+ close_mosaic_epochs = 2
+ save_epoch_intervals = 1
+ text_channels = 512
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
+ base_lr = 2e-3 * 0.5
+ weight_decay = 0.05 / 2
+ train_batch_size_per_gpu = 32
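+ # In the YOLOv8 base config last_stage_out_channels is 1024 (the 0.5
+ # widen_factor of the -s variant is applied later, at model build time), so
+ # the two expressions above resolve to neck_embed_channels = [128, 256, 512]
+ # and neck_num_heads = [4, 8, 16].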
+
+ # model settings
+ model = dict(
+     type='YOLOWorldDetector',
+     mm_neck=True,
+     num_train_classes=num_training_classes,
+     num_test_classes=num_classes,
+     data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
+     backbone=dict(
+         _delete_=True,
+         type='MultiModalYOLOBackbone',
+         image_model={{_base_.model.backbone}},
+         text_model=dict(
+             type='HuggingCLIPLanguageBackbone',
+             model_name='openai/clip-vit-base-patch32',
+             frozen_modules=['all'])),
+     neck=dict(type='YOLOWorldPAFPN',
+               guide_channels=text_channels,
+               embed_channels=neck_embed_channels,
+               num_heads=neck_num_heads,
+               block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv',
+                              use_einsum=False)),
+     bbox_head=dict(type='YOLOWorldHead',
+                    head_module=dict(type='YOLOWorldHeadModule',
+                                     use_bn_head=True,
+                                     embed_dims=text_channels,
+                                     num_classes=num_training_classes,
+                                     use_einsum=False)),
+     train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
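+ # How the pieces fit (a sketch based on the YOLO-World paper, not this fork's
+ # exact internals): the frozen CLIP text tower embeds each class prompt into
+ # a 512-d vector; the PAFPN and the BN-based head then score image features
+ # against those vectors, roughly
+ #     txt = clip_text(class_texts)            # (num_texts, 512)
+ #     logits = img_feats @ txt.T * scale      # per-location class scores
+ # use_einsum=False replaces einsum with plain matmuls, which tends to export
+ # more cleanly to ONNX for the deploy step noted at the top.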
+
+ # dataset settings
+ text_transform = [
+     dict(type='RandomLoadText',
+          num_neg_samples=(num_classes, num_classes),
+          max_num_samples=num_training_classes,
+          padding_to_max=True,
+          padding_value=''),
+     dict(type='mmdet.PackDetInputs',
+          meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
+                     'flip_direction', 'texts'))
+ ]
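+ # RandomLoadText keeps the class texts present in each image, samples
+ # num_neg_samples random negative texts, and pads the list to
+ # max_num_samples (80) so every training batch sees a fixed-size vocabulary.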
+ train_pipeline = [
+     *_base_.pre_transform,
+     dict(type='MultiModalMosaic',
+          img_scale=_base_.img_scale,
+          pad_val=114.0,
+          pre_transform=_base_.pre_transform),
+     dict(
+         type='YOLOv5RandomAffine',
+         max_rotate_degree=0.0,
+         max_shear_degree=0.0,
+         scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
+         max_aspect_ratio=_base_.max_aspect_ratio,
+         border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
+         border_val=(114, 114, 114)),
+     *_base_.last_transform[:-1],
+     *text_transform,
+ ]
+ train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
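+ # Both pipelines drop the base config's trailing PackDetInputs ([:-1]) and
+ # re-pack via text_transform so 'texts' travels with every sample. The
+ # stage-2 pipeline (mosaic disabled) is swapped in near the end of training
+ # by the PipelineSwitchHook defined below.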
+
+
+ # part_obj365v1_train_dataset = dict(
+ #     type='MultiModalDataset',
+ #     dataset=dict(
+ #         type='YOLOv5Objects365V1Dataset',
+ #         data_root='/data/taofuyu/tao_dataset/yolo_world/objects365v1/',
+ #         ann_file='annotations/objects365_train.json',
+ #         data_prefix=dict(img='train/'),
+ #         filter_cfg=dict(filter_empty_gt=False, min_size=32)),
+ #     class_text_path='data/texts/obj365v1_class_texts.json',
+ #     pipeline=train_pipeline)
+ mg_train_dataset = dict(
+     type='YOLOv5MixedGroundingDataset',
+     data_root='/data/taofuyu/tao_dataset/yolo_world/mixed_grounding/',
+     ann_file='annotations/final_mixed_train_no_coco.json',
+     data_prefix=dict(img='images/'),
+     filter_cfg=dict(filter_empty_gt=False, min_size=32),
+     pipeline=train_pipeline)
+
+ flickr_train_dataset = dict(
+     type='YOLOv5MixedGroundingDataset',
+     data_root='/data/taofuyu/tao_dataset/yolo_world/flickr/',
+     ann_file='annotations/final_flickr_separateGT_train.json',
+     data_prefix=dict(img='flickr30k_images/'),
+     filter_cfg=dict(filter_empty_gt=True, min_size=32),
+     pipeline=train_pipeline)
+
+ fg_train_dataset = dict(
+     type='MultiModalDataset',
+     dataset=dict(
+         type='YOLOv5FGDataset',
+         data_root='',
+         ann_file='/data/taofuyu/tao_dataset/井盖检测/jinggai_few_shot_3.json',
+         data_prefix=dict(img=''),
+         filter_cfg=dict(filter_empty_gt=False, min_size=32)),
+     class_text_path='/data/taofuyu/repos/YOLO-World/data/texts/fewshot_class_texts.json',
+     pipeline=train_pipeline)
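+ # The two grounding sets (Flickr30k entities plus the no-COCO mixed-grounding
+ # split) supply open-vocabulary supervision, while YOLOv5FGDataset (a dataset
+ # type custom to this fork; the annotation path is a few-shot manhole-cover
+ # set) carries the target domain.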
+
+ train_dataloader = dict(
+     batch_size=train_batch_size_per_gpu,
+     collate_fn=dict(type='yolow_collate'),
+     dataset=dict(_delete_=True,
+                  type='ConcatDataset',
+                  datasets=[
+                      fg_train_dataset, flickr_train_dataset, mg_train_dataset
+                  ],
+                  ignore_keys=['classes', 'palette']))
+
+ test_pipeline = [
+     *_base_.test_pipeline[:-1],
+     dict(type='LoadText',
+          text_path='/data/taofuyu/repos/YOLO-World/data/texts/fewshot_class_texts.json'),
+     dict(type='mmdet.PackDetInputs',
+          meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                     'scale_factor', 'pad_param', 'texts'))
+ ]
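+ # At test time LoadText injects the same fixed prompt list for every image;
+ # num_classes = 5 above is assumed to match the number of entries in
+ # fewshot_class_texts.json.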
+ coco_val_dataset = dict(
+     _delete_=True,
+     type='MultiModalDataset',
+     dataset=dict(
+         type='YOLOv5FGDataset',
+         data_root='',
+         ann_file='/data/taofuyu/tao_dataset/井盖检测/jinggai_few_shot_3.json',
+         data_prefix=dict(img=''),
+         filter_cfg=dict(filter_empty_gt=False, min_size=32)),
+     class_text_path='/data/taofuyu/repos/YOLO-World/data/texts/fewshot_class_texts.json',
+     pipeline=test_pipeline)
+ val_dataloader = dict(dataset=coco_val_dataset)
+ test_dataloader = val_dataloader
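+ # Despite the name, coco_val_dataset points at the same few-shot FG set used
+ # for training; the variable name appears to be inherited from the upstream
+ # COCO-based configs. Validating on the training annotations measures fit,
+ # not generalization.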
+
+ val_evaluator = dict(
+     _delete_=True,
+     type='mmdet.CocoMetric',
+     proposal_nums=(100, 1, 10),
+     ann_file='/data/taofuyu/tao_dataset/井盖检测/jinggai_few_shot_3.json',
+     metric='bbox')
+ test_evaluator = val_evaluator
+
+ # training settings
+ default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
+                      checkpoint=dict(interval=save_epoch_intervals,
+                                      rule='greater'))
+ custom_hooks = [
+     dict(type='EMAHook',
+          ema_type='ExpMomentumEMA',
+          momentum=0.0001,
+          update_buffers=True,
+          strict_load=False,
+          priority=49),
+     dict(type='mmdet.PipelineSwitchHook',
+          switch_epoch=max_epochs - close_mosaic_epochs,
+          switch_pipeline=train_pipeline_stage2)
+ ]
+ train_cfg = dict(max_epochs=max_epochs,
+                  val_interval=100,
+                  dynamic_intervals=[((max_epochs - close_mosaic_epochs),
+                                      _base_.val_interval_stage2)])
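+ # With val_interval equal to max_epochs, validation is effectively skipped
+ # until the mosaic-off stage; dynamic_intervals then restores the base
+ # config's stage-2 interval for the final close_mosaic_epochs.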
+ optim_wrapper = dict(
+     optimizer=dict(_delete_=True,
+                    type='AdamW',
+                    lr=base_lr,
+                    weight_decay=weight_decay,
+                    batch_size_per_gpu=train_batch_size_per_gpu),
+     paramwise_cfg=dict(bias_decay_mult=0.0,
+                        norm_decay_mult=0.0,
+                        custom_keys={
+                            'backbone.text_model': dict(lr_mult=0.01),
+                            'logit_scale': dict(weight_decay=0.0)
+                        }),
+     constructor='YOLOWv5OptimizerConstructor')
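+ # Typical launch (assuming YOLO-World's standard tools/ layout):
+ #     python tools/train.py configs/pretrain/yolo_world_s_pretrain_FG_v3.py
+ # Note the lr_mult=0.01 for backbone.text_model only matters if the text
+ # tower is unfrozen; with frozen_modules=['all'] above it receives no updates.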