shi-labs
/

oneformer_cityscapes_dinat_large

Image Segmentation

Inference Endpoints

Model card Files Files and versions Community

oneformer_cityscapes_dinat_large / config.json

praeclarumjj3's picture

Add model

695c75c almost 2 years ago

3.42 kB

	{
	"architectures": [
	"OneFormerForUniversalSegmentation"
	],
	"backbone_config": {
	"attention_probs_dropout_prob": 0.0,
	"depths": [
	3,
	4,
	18,
	5
	],
	"dilations": [
	[
	1,
	18,
	1
	],
	[
	1,
	5,
	1,
	9
	],
	[
	1,
	2,
	1,
	3,
	1,
	4,
	1,
	2,
	1,
	3,
	1,
	4,
	1,
	2,
	1,
	3,
	1,
	4
	],
	[
	1,
	2,
	1,
	2,
	1
	]
	],
	"drop_path_rate": 0.3,
	"embed_dim": 192,
	"encoder_stride": 32,
	"feature_channels": [
	192,
	384,
	768,
	1536
	],
	"hidden_act": "gelu",
	"hidden_dropout_prob": 0,
	"kernel_size": 7,
	"layer_scale_init_value": 0.0,
	"mlp_ratio": 2.0,
	"num_channels": 3,
	"num_heads": [
	6,
	12,
	24,
	48
	],
	"patch_size": 4,
	"qkv_bias": true,
	"strides": [
	4,
	8,
	16,
	32
	]
	},
	"decoder_config": {
	"common_stride": 4,
	"conv_dim": 256,
	"decoder_layers": 10,
	"dim_feedforward": 2048,
	"dropout": 0.1,
	"encoder_feedforward_dim": 1024,
	"encoder_layers": 6,
	"enforce_input_proj": false,
	"hidden_dim": 256,
	"mask_dim": 256,
	"norm": "GN",
	"num_heads": 8,
	"pre_norm": false,
	"query_dec_layers": 2,
	"use_task_norm": true
	},
	"general_config": {
	"backbone_type": "dinat",
	"class_weight": 2.0,
	"contrastive_temperature": 0.07,
	"contrastive_weight": 0.5,
	"deep_supervision": true,
	"dice_weight": 5.0,
	"ignore_value": 255,
	"importance_sample_ratio": 0.75,
	"init_std": 0.02,
	"init_xavier_std": 1.0,
	"is_train": false,
	"layer_norm_eps": 1e-05,
	"mask_weight": 5.0,
	"no_object_weight": 0.1,
	"num_classes": 19,
	"num_queries": 250,
	"output_auxiliary_logits": true,
	"oversample_ratio": 3.0,
	"train_num_points": 12544,
	"use_auxiliary_loss": true
	},
	"hidden_size": 256,
	"id2label": {
	"0": "road",
	"1": "sidewalk",
	"2": "building",
	"3": "wall",
	"4": "fence",
	"5": "pole",
	"6": "traffic light",
	"7": "traffic sign",
	"8": "vegetation",
	"9": "terrain",
	"10": "sky",
	"11": "person",
	"12": "rider",
	"13": "car",
	"14": "truck",
	"15": "bus",
	"16": "train",
	"17": "motorcycle",
	"18": "bicycle"
	},
	"init_std": 0.02,
	"init_xavier_std": 1.0,
	"label2id": {
	"bicycle": 18,
	"building": 2,
	"bus": 15,
	"car": 13,
	"fence": 4,
	"motorcycle": 17,
	"person": 11,
	"pole": 5,
	"rider": 12,
	"road": 0,
	"sidewalk": 1,
	"sky": 10,
	"terrain": 9,
	"traffic light": 6,
	"traffic sign": 7,
	"train": 16,
	"truck": 14,
	"vegetation": 8,
	"wall": 3
	},
	"model_type": "oneformer",
	"num_attention_heads": 8,
	"num_hidden_layers": 10,
	"output_attentions": true,
	"output_hidden_states": true,
	"text_encoder_config": {
	"max_seq_len": 77,
	"task_seq_len": 77,
	"text_encoder_context_length": 77,
	"text_encoder_n_ctx": 16,
	"text_encoder_num_layers": 6,
	"text_encoder_proj_layers": 2,
	"text_encoder_vocab_size": 49408,
	"text_encoder_width": 256
	},
	"torch_dtype": "float32",
	"transformers_version": "4.25.0.dev0"
	}