dhuck committed on
Commit
aa012a8
1 Parent(s): 5c56dd4

Upload StableDiffusionCLAPPipeline

Browse files
audio_encoder/config.json CHANGED
@@ -1,49 +1,29 @@
1
  {
2
- "_name_or_path": "dhuck/stable-diffusion-clap/audio_encoder",
3
- "aff_block_r": 4,
4
  "architectures": [
5
- "ClapAudioModel"
6
  ],
7
- "attention_probs_dropout_prob": 0.0,
8
- "depths": [
9
- 2,
10
- 2,
11
- 12,
12
- 2
13
- ],
14
- "drop_path_rate": 0.0,
15
- "enable_fusion": false,
16
- "enable_patch_layer_norm": true,
17
- "flatten_patch_embeds": true,
18
- "fusion_type": null,
19
- "hidden_act": "gelu",
20
- "hidden_dropout_prob": 0.1,
21
- "hidden_size": 1024,
22
  "initializer_factor": 1.0,
23
- "layer_norm_eps": 1e-05,
24
- "mlp_ratio": 4.0,
25
- "model_type": "clap_audio_model",
26
- "num_attention_heads": [
27
- 4,
28
- 8,
29
- 16,
30
- 32
31
- ],
32
- "num_classes": 527,
33
- "num_hidden_layers": 4,
34
- "num_mel_bins": 64,
35
- "patch_embed_input_channels": 1,
36
- "patch_embeds_hidden_size": 128,
37
- "patch_size": 4,
38
- "patch_stride": [
39
- 4,
40
- 4
41
- ],
42
  "projection_dim": 512,
43
  "projection_hidden_act": "relu",
44
- "qkv_bias": true,
45
- "spec_size": 256,
 
46
  "torch_dtype": "float32",
47
- "transformers_version": "4.37.1",
48
- "window_size": 8
49
  }
 
1
  {
2
+ "_name_or_path": "laion/larger_clap_music",
 
3
  "architectures": [
4
+ "ClapModel"
5
  ],
6
+ "audio_config": {
7
+ "depths": [
8
+ 2,
9
+ 2,
10
+ 12,
11
+ 2
12
+ ],
13
+ "hidden_size": 1024,
14
+ "model_type": "clap_audio_model",
15
+ "patch_embeds_hidden_size": 128
16
+ },
17
+ "hidden_size": 768,
 
 
 
18
  "initializer_factor": 1.0,
19
+ "logit_scale_init_value": 14.285714285714285,
20
+ "model_type": "clap",
21
+ "num_hidden_layers": 16,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  "projection_dim": 512,
23
  "projection_hidden_act": "relu",
24
+ "text_config": {
25
+ "model_type": "clap_text_model"
26
+ },
27
  "torch_dtype": "float32",
28
+ "transformers_version": "4.37.1"
 
29
  }
audio_encoder/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0542c0ea071de5a84232c0394c42c9ffffc6dd422e0a364685ade6021172888d
3
- size 271931456
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:013f1d40b8981b5dd741c0ccc444ae6cf8485d7cd4333892bcb2c6e8c3047064
3
+ size 776327440
model_index.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "_class_name": "StableDiffusionCLAPPipeline",
3
  "_diffusers_version": "0.25.0.dev0",
4
- "_name_or_path": "dhuck/stable-diffusion-clap",
5
  "audio_encoder": [
6
  "transformers",
7
- "ClapAudioModel"
8
  ],
9
  "audio_processor": [
10
  "transformers",
 
1
  {
2
  "_class_name": "StableDiffusionCLAPPipeline",
3
  "_diffusers_version": "0.25.0.dev0",
4
+ "_name_or_path": "stabilityai/stable-diffusion-2-1",
5
  "audio_encoder": [
6
  "transformers",
7
+ "ClapModel"
8
  ],
9
  "audio_processor": [
10
  "transformers",
text_encoder/config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "dhuck/stable-diffusion-clap/text_encoder",
3
  "architectures": [
4
  "CLIPTextModel"
5
  ],
 
1
  {
2
+ "_name_or_path": "/home/dhuck/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/text_encoder",
3
  "architectures": [
4
  "CLIPTextModel"
5
  ],
tokenizer/special_tokens_map.json CHANGED
@@ -13,13 +13,7 @@
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
- "pad_token": {
17
- "content": "!",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
  "unk_token": {
24
  "content": "<|endoftext|>",
25
  "lstrip": false,
 
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
+ "pad_token": "!",
 
 
 
 
 
 
17
  "unk_token": {
18
  "content": "<|endoftext|>",
19
  "lstrip": false,
unet/config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "_class_name": "UNet2DConditionModel",
3
  "_diffusers_version": "0.25.0.dev0",
4
- "_name_or_path": "dhuck/stable-diffusion-clap/unet",
5
  "act_fn": "silu",
6
  "addition_embed_type": null,
7
  "addition_embed_type_num_heads": 64,
 
1
  {
2
  "_class_name": "UNet2DConditionModel",
3
  "_diffusers_version": "0.25.0.dev0",
4
+ "_name_or_path": "stabilityai/stable-diffusion-2-1",
5
  "act_fn": "silu",
6
  "addition_embed_type": null,
7
  "addition_embed_type_num_heads": 64,
unet/diffusion_pytorch_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90c904a46af0e172d59ee379709f77cf2e53012706c4935bd73e4593d672132e
3
  size 3509909336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddb59fe25f5428e6cf3a4d522a98ab977e72e85fca5863f04230c0bf6f288aef
3
  size 3509909336
vae/config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "_class_name": "AutoencoderKL",
3
  "_diffusers_version": "0.25.0.dev0",
4
- "_name_or_path": "dhuck/stable-diffusion-clap/vae",
5
  "act_fn": "silu",
6
  "block_out_channels": [
7
  128,
 
1
  {
2
  "_class_name": "AutoencoderKL",
3
  "_diffusers_version": "0.25.0.dev0",
4
+ "_name_or_path": "/home/dhuck/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6/vae",
5
  "act_fn": "silu",
6
  "block_out_channels": [
7
  128,