Update README.md

README.md CHANGED

@@ -6,6 +6,107 @@ library_name: diffusers

<!-- Provide a quick summary of what the model is/does. -->
Script for creating dummy random model:

```python
# Build a tiny, randomly initialized HunyuanVideo pipeline and publish it to the
# Hub as "hf-internal-testing/tiny-random-hunyuanvideo". The component sizes are
# deliberately minimal (1-layer blocks, tiny hidden dims) so the checkpoint is
# fast to download and instantiate in CI tests.
#
# NOTE: each `torch.manual_seed(0)` pins the random init of the module created
# right after it, so the pushed weights are reproducible. Do not reorder these
# statements — doing so would change the published weights.
import torch
from diffusers import HunyuanVideoTransformer3DModel, AutoencoderKLHunyuanVideo, FlowMatchEulerDiscreteScheduler, HunyuanVideoPipeline
from transformers import LlamaModel, LlamaTokenizerFast, CLIPTextModel, CLIPTokenizer, LlamaConfig, CLIPTextConfig

# Tiny denoising transformer (the HunyuanVideo DiT backbone).
torch.manual_seed(0)
transformer = HunyuanVideoTransformer3DModel(
    in_channels=4,
    out_channels=4,
    num_attention_heads=2,
    attention_head_dim=10,
    num_layers=1,
    num_single_layers=1,
    num_refiner_layers=1,
    patch_size=1,
    patch_size_t=1,
    guidance_embeds=True,
    text_embed_dim=16,          # must match the Llama text encoder's hidden_size below
    pooled_projection_dim=8,    # must match the CLIP text encoder's hidden_size below
    rope_axes_dim=(2, 4, 4),
)

# Tiny 3D VAE; latent_channels=4 matches the transformer's in/out_channels.
torch.manual_seed(0)
vae = AutoencoderKLHunyuanVideo(
    in_channels=3,
    out_channels=3,
    latent_channels=4,
    down_block_types=(
        "HunyuanVideoDownBlock3D",
        "HunyuanVideoDownBlock3D",
        "HunyuanVideoDownBlock3D",
        "HunyuanVideoDownBlock3D",
    ),
    up_block_types=(
        "HunyuanVideoUpBlock3D",
        "HunyuanVideoUpBlock3D",
        "HunyuanVideoUpBlock3D",
        "HunyuanVideoUpBlock3D",
    ),
    block_out_channels=(8, 8, 8, 8),
    layers_per_block=1,
    act_fn="silu",
    norm_num_groups=4,
    scaling_factor=0.476986,
    spatial_compression_ratio=8,
    temporal_compression_ratio=4,
    mid_block_add_attention=True,
)

# Deterministic scheduler (no learnable weights; the seed here is harmless).
torch.manual_seed(0)
scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)

# Tiny config for the primary (Llama) text encoder.
llama_text_encoder_config = LlamaConfig(
    bos_token_id=0,
    eos_token_id=2,
    hidden_size=16,
    intermediate_size=37,
    layer_norm_eps=1e-05,
    num_attention_heads=4,
    num_hidden_layers=2,
    pad_token_id=1,
    vocab_size=1000,
    hidden_act="gelu",
    projection_dim=32,
)
# Tiny config for the secondary (CLIP) text encoder; only hidden_size differs.
clip_text_encoder_config = CLIPTextConfig(
    bos_token_id=0,
    eos_token_id=2,
    hidden_size=8,
    intermediate_size=37,
    layer_norm_eps=1e-05,
    num_attention_heads=4,
    num_hidden_layers=2,
    pad_token_id=1,
    vocab_size=1000,
    hidden_act="gelu",
    projection_dim=32,
)

# NOTE(review): unlike the other randomly initialized modules, LlamaModel is
# constructed without a fresh torch.manual_seed(0) immediately before it (its
# init consumes RNG state left over from the scheduler block above). Presumably
# this is how the published checkpoint was generated, so re-seeding here would
# change the weights — confirm before "fixing".
text_encoder = LlamaModel(llama_text_encoder_config)
tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM")

torch.manual_seed(0)
text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

# Assemble the full pipeline from the tiny components.
pipe = HunyuanVideoPipeline(
    transformer=transformer,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    text_encoder_2=text_encoder_2,
    tokenizer_2=tokenizer_2,
    vae=vae,
    scheduler=scheduler,
)

# Upload all components to the Hub (requires write credentials).
pipe.push_to_hub("hf-internal-testing/tiny-random-hunyuanvideo")
```

## Model Details