{ "act_layer": "SiLU", "decoder_depth": 24, "dim": 2048, "domains_in": [ "caption", "t5_caption", "det", "metadata", "human_poses", "color_palette", "sam_instance", "rgb@224", "tok_rgb@224", "tok_normal@224", "tok_depth@224", "tok_semseg@224", "tok_clip@224", "tok_dinov2@224", "tok_dinov2_global", "tok_imagebind@224", "tok_imagebind_global", "tok_sam_edge@224", "tok_canny_edge@224" ], "domains_out": [ "caption", "t5_caption", "det", "metadata", "human_poses", "color_palette", "sam_instance", "tok_rgb@224", "tok_normal@224", "tok_depth@224", "tok_semseg@224", "tok_clip@224", "tok_dinov2@224", "tok_dinov2_global", "tok_imagebind@224", "tok_imagebind_global", "tok_sam_edge@224", "tok_canny_edge@224" ], "encoder_depth": 24, "gated_mlp": true, "image_size": 224, "mlp_bias": false, "mlp_ratio": 4, "norm_bias": false, "num_heads": 32, "patch_size": 16, "proj_bias": false, "qkv_bias": false, "share_modality_embeddings": false }