imdigitalashish committed on
Commit
a041f5a
1 Parent(s): b02fc9c

Upload 16 files

Browse files
Kandinsky-2-0-inpainting.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38bf6d53435b913633a68db0d0e4e9f752129b67d8b0113232a31b1377449bfb
3
+ size 2672937865
Kandinsky-2-0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40552b65e26bb9283e027cb007077d3a9de1cc12aaadcf9cc98ae9f2ae33264f
3
+ size 2672894857
README.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - Kandinsky
5
+ - text-image
6
+ - text2image
7
+ - diffusion
8
+ - latent diffusion
9
+ - mCLIP-XLMR
10
+ - mT5
11
+ ---
12
+
13
+ # Kandinsky 2.0
14
+
15
+
16
+ Kandinsky 2.0 — the first multilingual text2image model.
17
+
18
+ [Open In Colab](https://colab.research.google.com/drive/1uPg9KwGZ2hJBl9taGA_3kyKGw12Rh3ij?usp=sharing)
19
+
20
+ [GitHub repository](https://github.com/ai-forever/Kandinsky-2.0)
21
+
22
+ [Habr post](https://habr.com/ru/company/sberbank/blog/701162/)
23
+
24
+ [Demo](https://rudalle.ru/)
25
+
26
+ **UNet size: 1.2B parameters**
27
+
28
+ ![NatallE.png](https://s3.amazonaws.com/moonup/production/uploads/1669132577749-5f91b1208a61a359f44e1851.png)
29
+
30
+ It is a latent diffusion model with two multilingual text encoders:
31
+ * mCLIP-XLMR (560M parameters)
32
+ * mT5-encoder-small (146M parameters)
33
+
34
+
35
+ Together, these encoders and the multilingual training datasets deliver a truly multilingual text2image generation experience!
36
+
37
+ ![header.png](https://s3.amazonaws.com/moonup/production/uploads/1669132825912-5f91b1208a61a359f44e1851.png)
38
+
39
+ # How to use
40
+
41
+ ```python
42
+ # Install the package first (run in a shell): pip install "git+https://github.com/ai-forever/Kandinsky-2.0.git"
43
+
44
+ from kandinsky2 import get_kandinsky2
45
+ model = get_kandinsky2('cuda', task_type='text2img')
46
+ images = model.generate_text2img('кошка в космосе', batch_size=4, h=512, w=512, num_steps=75, denoised_type='dynamic_threshold', dynamic_threshold_v=99.5, sampler='ddim_sampler', ddim_eta=0.01, guidance_scale=10)
47
+ ```
48
+
49
+ # Authors
50
+
51
+ + Arseniy Shakhmatov: [GitHub](https://github.com/cene555), [Blog](https://t.me/gradientdip)
52
+ + Anton Razzhigaev: [GitHub](https://github.com/razzant), [Blog](https://t.me/abstractDL)
53
+ + Aleksandr Nikolich: [GitHub](https://github.com/AlexWortega), [Blog](https://t.me/lovedeathtransformers)
54
+ + Vladimir Arkhipkin: [GitHub](https://github.com/oriBetelgeuse)
55
+ + Igor Pavlov: [GitHub](https://github.com/boomb0om)
56
+ + Andrey Kuznetsov: [GitHub](https://github.com/kuznetsoffandrey)
57
+ + Denis Dimitrov: [GitHub](https://github.com/denndimitrov)
58
+
59
+
60
+
text_encoder1/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "XLMRobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 1024,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 4096,
13
+ "layer_norm_eps": 1e-05,
14
+ "max_position_embeddings": 514,
15
+ "model_type": "xlm-roberta",
16
+ "num_attention_heads": 16,
17
+ "num_hidden_layers": 24,
18
+ "output_past": true,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.17.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 250002
25
+ }
text_encoder1/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7124266439dd6bce544c23b57249a2ca764dbcd38e8eab16ce272c28c27b049
3
+ size 2242347565
text_encoder1/sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
text_encoder1/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
text_encoder1/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
text_encoder1/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "sep_token": "</s>", "cls_token": "<s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "xlm-roberta-large", "tokenizer_class": "XLMRobertaTokenizer"}
text_encoder2/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MT5ForConditionalGeneration"
4
+ ],
5
+ "d_ff": 1024,
6
+ "d_kv": 64,
7
+ "d_model": 512,
8
+ "decoder_start_token_id": 0,
9
+ "dropout_rate": 0.1,
10
+ "eos_token_id": 1,
11
+ "feed_forward_proj": "gated-gelu",
12
+ "initializer_factor": 1.0,
13
+ "is_encoder_decoder": true,
14
+ "layer_norm_epsilon": 1e-06,
15
+ "model_type": "mt5",
16
+ "num_decoder_layers": 8,
17
+ "num_heads": 6,
18
+ "num_layers": 8,
19
+ "pad_token_id": 0,
20
+ "relative_attention_num_buckets": 32,
21
+ "tie_word_embeddings": false,
22
+ "tokenizer_class": "T5Tokenizer",
23
+ "vocab_size": 250112
24
+ }
text_encoder2/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9daa76f0231b96162833ebfee26e886a781b72e0d95ad1a1826b9147a74a939a
3
+ size 1200794589
text_encoder2/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": []}
text_encoder2/spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
3
+ size 4309802
text_encoder2/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 0}
vae.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9890a9697c53045ad52965b33aadc3429d43a644c9af4c01ede3d551f3adf0a
3
+ size 1096193273