{
  "num_attention_heads": 16,
  "attention_head_dim": 72,
  "in_channels": 4,
  "cond_channels": 9,
  "out_channels": 8,
  "num_layers": 28,
  "dropout": 0.0,
  "norm_num_groups": 32,
  "cross_attention_dim": 1152,
  "attention_bias": true,
  "sample_size": 128,
  "patch_size": 2,
  "activation_fn": "gelu-approximate",
  "num_embeds_ada_norm": 1000,
  "upcast_attention": false,
  "norm_type": "ada_norm_single",
  "norm_elementwise_affine": false,
  "norm_eps": 1e-06,
  "caption_channels": 4096,
  "attention_type": "default"
}