rtdetr_v2_r50vd / config.json
danelcsb's picture
Add config from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py
79e2a4b verified
{
"activation_dropout": 0.0,
"activation_function": "silu",
"anchor_image_size": null,
"architectures": [
"RTDetrV2ForObjectDetection"
],
"attention_dropout": 0.0,
"auxiliary_loss": true,
"backbone": null,
"backbone_config": {
"model_type": "rt_detr_v2_resnet",
"out_features": [
"stage2",
"stage3",
"stage4"
],
"out_indices": [
2,
3,
4
]
},
"backbone_kwargs": null,
"batch_norm_eps": 1e-05,
"box_noise_scale": 1.0,
"d_model": 256,
"decoder_activation_function": "relu",
"decoder_attention_heads": 8,
"decoder_ffn_dim": 1024,
"decoder_in_channels": [
256,
256,
256
],
"decoder_layers": 6,
"decoder_n_levels": 3,
"decoder_n_points": 4,
"decoder_offset_scale": 0.5,
"disable_custom_kernels": true,
"dropout": 0.0,
"encode_proj_layers": [
2
],
"encoder_activation_function": "gelu",
"encoder_attention_heads": 8,
"encoder_ffn_dim": 1024,
"encoder_hidden_dim": 256,
"encoder_in_channels": [
512,
1024,
2048
],
"encoder_layers": 1,
"eos_coefficient": 0.0001,
"eval_size": null,
"feat_strides": [
8,
16,
32
],
"focal_loss_alpha": 0.75,
"focal_loss_gamma": 2.0,
"freeze_backbone_batch_norms": true,
"hidden_expansion": 1.0,
"id2label": {
"0": "person",
"1": "bicycle",
"2": "car",
"3": "motorbike",
"4": "aeroplane",
"5": "bus",
"6": "train",
"7": "truck",
"8": "boat",
"9": "traffic light",
"10": "fire hydrant",
"11": "stop sign",
"12": "parking meter",
"13": "bench",
"14": "bird",
"15": "cat",
"16": "dog",
"17": "horse",
"18": "sheep",
"19": "cow",
"20": "elephant",
"21": "bear",
"22": "zebra",
"23": "giraffe",
"24": "backpack",
"25": "umbrella",
"26": "handbag",
"27": "tie",
"28": "suitcase",
"29": "frisbee",
"30": "skis",
"31": "snowboard",
"32": "sports ball",
"33": "kite",
"34": "baseball bat",
"35": "baseball glove",
"36": "skateboard",
"37": "surfboard",
"38": "tennis racket",
"39": "bottle",
"40": "wine glass",
"41": "cup",
"42": "fork",
"43": "knife",
"44": "spoon",
"45": "bowl",
"46": "banana",
"47": "apple",
"48": "sandwich",
"49": "orange",
"50": "broccoli",
"51": "carrot",
"52": "hot dog",
"53": "pizza",
"54": "donut",
"55": "cake",
"56": "chair",
"57": "sofa",
"58": "pottedplant",
"59": "bed",
"60": "diningtable",
"61": "toilet",
"62": "tvmonitor",
"63": "laptop",
"64": "mouse",
"65": "remote",
"66": "keyboard",
"67": "cell phone",
"68": "microwave",
"69": "oven",
"70": "toaster",
"71": "sink",
"72": "refrigerator",
"73": "book",
"74": "clock",
"75": "vase",
"76": "scissors",
"77": "teddy bear",
"78": "hair drier",
"79": "toothbrush"
},
"initializer_bias_prior_prob": null,
"initializer_range": 0.01,
"is_encoder_decoder": true,
"label2id": {
"aeroplane": 4,
"apple": 47,
"backpack": 24,
"banana": 46,
"baseball bat": 34,
"baseball glove": 35,
"bear": 21,
"bed": 59,
"bench": 13,
"bicycle": 1,
"bird": 14,
"boat": 8,
"book": 73,
"bottle": 39,
"bowl": 45,
"broccoli": 50,
"bus": 5,
"cake": 55,
"car": 2,
"carrot": 51,
"cat": 15,
"cell phone": 67,
"chair": 56,
"clock": 74,
"cow": 19,
"cup": 41,
"diningtable": 60,
"dog": 16,
"donut": 54,
"elephant": 20,
"fire hydrant": 10,
"fork": 42,
"frisbee": 29,
"giraffe": 23,
"hair drier": 78,
"handbag": 26,
"horse": 17,
"hot dog": 52,
"keyboard": 66,
"kite": 33,
"knife": 43,
"laptop": 63,
"microwave": 68,
"motorbike": 3,
"mouse": 64,
"orange": 49,
"oven": 69,
"parking meter": 12,
"person": 0,
"pizza": 53,
"pottedplant": 58,
"refrigerator": 72,
"remote": 65,
"sandwich": 48,
"scissors": 76,
"sheep": 18,
"sink": 71,
"skateboard": 36,
"skis": 30,
"snowboard": 31,
"sofa": 57,
"spoon": 44,
"sports ball": 32,
"stop sign": 11,
"suitcase": 28,
"surfboard": 37,
"teddy bear": 77,
"tennis racket": 38,
"tie": 27,
"toaster": 70,
"toilet": 61,
"toothbrush": 79,
"traffic light": 9,
"train": 6,
"truck": 7,
"tvmonitor": 62,
"umbrella": 25,
"vase": 75,
"wine glass": 40,
"zebra": 22
},
"label_noise_ratio": 0.5,
"layer_norm_eps": 1e-05,
"learn_initial_query": false,
"matcher_alpha": 0.25,
"matcher_bbox_cost": 5.0,
"matcher_class_cost": 2.0,
"matcher_gamma": 2.0,
"matcher_giou_cost": 2.0,
"model_type": "rt_detr_v2",
"normalize_before": false,
"num_denoising": 100,
"num_feature_levels": 3,
"num_queries": 300,
"positional_encoding_temperature": 10000,
"torch_dtype": "float32",
"transformers_version": "4.45.0.dev0",
"use_focal_loss": true,
"use_pretrained_backbone": false,
"use_timm_backbone": false,
"weight_loss_bbox": 5.0,
"weight_loss_giou": 2.0,
"weight_loss_vfl": 1.0,
"with_box_refine": true
}