README.md CHANGED
@@ -78,54 +78,38 @@ pipeline_tag: text-to-video
 # Quick start

- 1. Install the necessary requirements.
-
- - Ensure Python >= 3.10, PyTorch >= 2.4, CUDA >= 12.4.
- - It is recommended to use Anaconda to create a new environment (Python >= 3.10), e.g. `conda create -n rllegro python=3.10 -y`, to run the following example.
- - Run `pip install git+https://github.com/huggingface/diffusers.git torch==2.4.1 transformers==4.40.1 accelerate sentencepiece imageio imageio-ffmpeg beautifulsoup4`.
-
- 2. Run inference.

 ```python
- import torch
- from diffusers import AutoencoderKLAllegro, AllegroPipeline
- from diffusers.utils import export_to_video
-
- vae = AutoencoderKLAllegro.from_pretrained("rhymes-ai/Allegro", subfolder="vae", torch_dtype=torch.float32)
- pipe = AllegroPipeline.from_pretrained("rhymes-ai/Allegro", vae=vae, torch_dtype=torch.bfloat16)
- pipe.to("cuda")
- pipe.vae.enable_tiling()
-
- prompt = "A seaside harbor with bright sunlight and sparkling seawater, with many boats in the water. From an aerial view, the boats vary in size and color, some moving and some stationary. Fishing boats in the water suggest that this location might be a popular spot for docking fishing boats."
-
- positive_prompt = """
- (masterpiece), (best quality), (ultra-detailed), (unwatermarked),
- {}
- emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo,
- sharp focus, high budget, cinemascope, moody, epic, gorgeous
- """
-
- negative_prompt = """
- nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality,
- low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry.
- """
-
- # Fill the user prompt into the positive prompt template.
- prompt = positive_prompt.format(prompt.lower().strip())
-
- video = pipe(prompt, negative_prompt=negative_prompt, guidance_scale=7.5, max_sequence_length=512, num_inference_steps=100, generator=torch.Generator(device="cuda:0").manual_seed(42)).frames[0]
- export_to_video(video, "output.mp4", fps=15)
 ```
-
- Use `pipe.enable_sequential_cpu_offload()` to offload the model to the CPU for a lower GPU memory cost (about 9.3 GB, compared to 27.5 GB if CPU offload is not enabled), but inference time will increase significantly, as sketched below.
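A minimal sketch of how that call slots into the example above, assuming the same `pipe`, `prompt`, and `negative_prompt` objects; `enable_sequential_cpu_offload()` takes the place of the explicit `pipe.to("cuda")`:

```python
# Sketch: sequential CPU offload trades inference speed for GPU memory.
# Call this instead of pipe.to("cuda"); weights are moved to the GPU
# submodule by submodule during sampling.
pipe.enable_sequential_cpu_offload()
video = pipe(prompt, negative_prompt=negative_prompt, guidance_scale=7.5,
             max_sequence_length=512, num_inference_steps=100).frames[0]
```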

- 3. (Optional) Interpolate the video to 30 FPS.

 It is recommended to use [EMA-VFI](https://github.com/MCG-NJU/EMA-VFI) to interpolate the video from 15 FPS to 30 FPS.

 For better visual quality, please use imageio to save the video.

- 4. For faster inference options such as Context Parallel and PAB, please refer to our [GitHub repo](https://github.com/rhymes-ai/Allegro).
-
 # License
 This repo is released under the Apache 2.0 License.
 
 # Quick start

+ 1. Download the [Allegro GitHub code](https://github.com/rhymes-ai/Allegro).
+
+ 2. Install the necessary requirements.
+
+ - Ensure Python >= 3.10, PyTorch >= 2.4, CUDA >= 12.4. For details, see [requirements.txt](https://github.com/rhymes-ai/Allegro/blob/main/requirements.txt).
+ - It is recommended to use Anaconda to create a new environment (Python >= 3.10), e.g. `conda create -n rllegro python=3.10 -y`, to run the following example.
+
+ 3. Download the [Allegro model weights](https://huggingface.co/rhymes-ai/Allegro). Until the diffusers integration is available, use git lfs or `snapshot_download`, as in the sketch below.
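For example, a minimal sketch of the `snapshot_download` route; the `local_dir` value is an arbitrary example, and running `git clone https://huggingface.co/rhymes-ai/Allegro` after `git lfs install` is the equivalent git lfs route:

```python
# Sketch: fetch the full Allegro weight repo with huggingface_hub.
# "./allegro_weights" is an example path; point the inference flags below
# at its vae/, transformer/, text_encoder/ and tokenizer/ subfolders.
from huggingface_hub import snapshot_download

snapshot_download(repo_id="rhymes-ai/Allegro", local_dir="./allegro_weights")
```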
+
+ 4. Run inference.
+
 ```bash
+ python single_inference.py \
+     --user_prompt 'A seaside harbor with bright sunlight and sparkling seawater, with many boats in the water. From an aerial view, the boats vary in size and color, some moving and some stationary. Fishing boats in the water suggest that this location might be a popular spot for docking fishing boats.' \
+     --save_path ./output_videos/test_video.mp4 \
+     --vae your/path/to/vae \
+     --dit your/path/to/transformer \
+     --text_encoder your/path/to/text_encoder \
+     --tokenizer your/path/to/tokenizer \
+     --guidance_scale 7.5 \
+     --num_sampling_steps 100 \
+     --seed 42
 ```
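Here each `your/path/to/...` placeholder is expected to point at the corresponding subfolder (`vae`, `transformer`, `text_encoder`, `tokenizer`) of the weights downloaded in step 3.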
+
+ Use `--enable_cpu_offload` to offload the model to the CPU for a lower GPU memory cost (about 9.3 GB, compared to 27.5 GB if CPU offload is not enabled), but inference time will increase significantly.

+ 5. (Optional) Interpolate the video to 30 FPS.

 It is recommended to use [EMA-VFI](https://github.com/MCG-NJU/EMA-VFI) to interpolate the video from 15 FPS to 30 FPS.

 For better visual quality, please use imageio to save the video, as in the sketch below.
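A minimal sketch of the imageio route, assuming `video` holds the generated frames as uint8 numpy arrays (the variable name and output path are illustrative):

```python
# Sketch: write frames with imageio (backed by imageio-ffmpeg) at 15 FPS.
# Assumes `video` is a sequence of HxWx3 uint8 numpy arrays.
import imageio

writer = imageio.get_writer("output.mp4", fps=15)
for frame in video:
    writer.append_data(frame)
writer.close()
```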

 # License
 This repo is released under the Apache 2.0 License.
model_index.json DELETED
@@ -1,24 +0,0 @@
- {
-   "_class_name": "AllegroPipeline",
-   "_diffusers_version": "0.31.0.dev0",
-   "scheduler": [
-     "diffusers",
-     "EulerAncestralDiscreteScheduler"
-   ],
-   "text_encoder": [
-     "transformers",
-     "T5EncoderModel"
-   ],
-   "tokenizer": [
-     "transformers",
-     "T5Tokenizer"
-   ],
-   "transformer": [
-     "diffusers",
-     "AllegroTransformer3DModel"
-   ],
-   "vae": [
-     "diffusers",
-     "AutoencoderKLAllegro"
-   ]
- }
text_encoder/model-00001-of-00004.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7a68b2c8c080696a10109612a649bc69330991ecfea65930ccfdfbdb011f2686
- size 4989319680
text_encoder/model-00002-of-00004.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b8ed6556d7507e38af5b428c605fb2a6f2bdb7e80bd481308b865f7a40c551ca
- size 4999830656
text_encoder/model-00003-of-00004.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c831635f83041f83faf0024b39c6ecb21b45d70dd38a63ea5bac6c7c6e5e558c
- size 4865612720
text_encoder/model-00004-of-00004.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:02a5f2d69205be92ad48fe5d712d38c2ff55627969116aeffc58bd75a28da468
- size 4194506688
text_encoder/model.safetensors.index.json DELETED
@@ -1,226 +0,0 @@
- {
-   "metadata": {
-     "total_size": 19049242624
-   },
-   "weight_map": {
-     "encoder.block.0.layer.0.SelfAttention.k.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.0.layer.0.SelfAttention.o.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.0.layer.0.SelfAttention.q.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.0.layer.0.SelfAttention.v.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.0.layer.0.layer_norm.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.0.layer.1.DenseReluDense.wo.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.0.layer.1.layer_norm.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.1.layer.0.SelfAttention.k.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.1.layer.0.SelfAttention.o.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.1.layer.0.SelfAttention.q.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.1.layer.0.SelfAttention.v.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.1.layer.0.layer_norm.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.1.layer.1.DenseReluDense.wo.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.1.layer.1.layer_norm.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.10.layer.0.SelfAttention.k.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.10.layer.0.SelfAttention.o.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.10.layer.0.SelfAttention.q.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.10.layer.0.SelfAttention.v.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.10.layer.0.layer_norm.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.10.layer.1.DenseReluDense.wo.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.10.layer.1.layer_norm.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.11.layer.0.SelfAttention.k.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.11.layer.0.SelfAttention.o.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.11.layer.0.SelfAttention.q.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.11.layer.0.SelfAttention.v.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.11.layer.0.layer_norm.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.11.layer.1.DenseReluDense.wo.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.11.layer.1.layer_norm.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.12.layer.0.SelfAttention.k.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.12.layer.0.SelfAttention.o.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.12.layer.0.SelfAttention.q.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.12.layer.0.SelfAttention.v.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.12.layer.0.layer_norm.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.12.layer.1.DenseReluDense.wo.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.12.layer.1.layer_norm.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.13.layer.0.SelfAttention.k.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.13.layer.0.SelfAttention.o.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.13.layer.0.SelfAttention.q.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.13.layer.0.SelfAttention.v.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.13.layer.0.layer_norm.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.13.layer.1.DenseReluDense.wo.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.13.layer.1.layer_norm.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.14.layer.0.SelfAttention.k.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.14.layer.0.SelfAttention.o.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.14.layer.0.SelfAttention.q.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.14.layer.0.SelfAttention.v.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.14.layer.0.layer_norm.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.14.layer.1.DenseReluDense.wo.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.14.layer.1.layer_norm.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.15.layer.0.SelfAttention.k.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.15.layer.0.SelfAttention.o.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.15.layer.0.SelfAttention.q.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.15.layer.0.SelfAttention.v.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.15.layer.0.layer_norm.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.15.layer.1.DenseReluDense.wo.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.15.layer.1.layer_norm.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.16.layer.0.SelfAttention.k.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.16.layer.0.SelfAttention.o.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.16.layer.0.SelfAttention.q.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.16.layer.0.SelfAttention.v.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.16.layer.0.layer_norm.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.16.layer.1.DenseReluDense.wo.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.16.layer.1.layer_norm.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.17.layer.0.SelfAttention.k.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.17.layer.0.SelfAttention.o.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.17.layer.0.SelfAttention.q.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.17.layer.0.SelfAttention.v.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.17.layer.0.layer_norm.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.17.layer.1.DenseReluDense.wo.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.17.layer.1.layer_norm.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.18.layer.0.SelfAttention.k.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.18.layer.0.SelfAttention.o.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.18.layer.0.SelfAttention.q.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.18.layer.0.SelfAttention.v.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.18.layer.0.layer_norm.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00004.safetensors",
-     "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.18.layer.1.DenseReluDense.wo.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.18.layer.1.layer_norm.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.19.layer.0.SelfAttention.k.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.19.layer.0.SelfAttention.o.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.19.layer.0.SelfAttention.q.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.19.layer.0.SelfAttention.v.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.19.layer.0.layer_norm.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.19.layer.1.DenseReluDense.wo.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.19.layer.1.layer_norm.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.2.layer.0.SelfAttention.k.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.2.layer.0.SelfAttention.o.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.2.layer.0.SelfAttention.q.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.2.layer.0.SelfAttention.v.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.2.layer.0.layer_norm.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.2.layer.1.DenseReluDense.wo.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.2.layer.1.layer_norm.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.20.layer.0.SelfAttention.k.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.20.layer.0.SelfAttention.o.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.20.layer.0.SelfAttention.q.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.20.layer.0.SelfAttention.v.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.20.layer.0.layer_norm.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.20.layer.1.DenseReluDense.wo.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.20.layer.1.layer_norm.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.21.layer.0.SelfAttention.k.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.21.layer.0.SelfAttention.o.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.21.layer.0.SelfAttention.q.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.21.layer.0.SelfAttention.v.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.21.layer.0.layer_norm.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.21.layer.1.DenseReluDense.wo.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.21.layer.1.layer_norm.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.22.layer.0.SelfAttention.k.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.22.layer.0.SelfAttention.o.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.22.layer.0.SelfAttention.q.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.22.layer.0.SelfAttention.v.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.22.layer.0.layer_norm.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.22.layer.1.DenseReluDense.wo.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.22.layer.1.layer_norm.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.23.layer.0.SelfAttention.k.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.23.layer.0.SelfAttention.o.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.23.layer.0.SelfAttention.q.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.23.layer.0.SelfAttention.v.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.23.layer.0.layer_norm.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.23.layer.1.DenseReluDense.wo.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.23.layer.1.layer_norm.weight": "model-00004-of-00004.safetensors",
-     "encoder.block.3.layer.0.SelfAttention.k.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.3.layer.0.SelfAttention.o.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.3.layer.0.SelfAttention.q.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.3.layer.0.SelfAttention.v.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.3.layer.0.layer_norm.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.3.layer.1.DenseReluDense.wo.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.3.layer.1.layer_norm.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.4.layer.0.SelfAttention.k.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.4.layer.0.SelfAttention.o.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.4.layer.0.SelfAttention.q.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.4.layer.0.SelfAttention.v.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.4.layer.0.layer_norm.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.4.layer.1.DenseReluDense.wo.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.4.layer.1.layer_norm.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.5.layer.0.SelfAttention.k.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.5.layer.0.SelfAttention.o.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.5.layer.0.SelfAttention.q.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.5.layer.0.SelfAttention.v.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.5.layer.0.layer_norm.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00004.safetensors",
-     "encoder.block.5.layer.1.DenseReluDense.wo.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.5.layer.1.layer_norm.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.6.layer.0.SelfAttention.k.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.6.layer.0.SelfAttention.o.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.6.layer.0.SelfAttention.q.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.6.layer.0.SelfAttention.v.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.6.layer.0.layer_norm.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.6.layer.1.DenseReluDense.wo.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.6.layer.1.layer_norm.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.7.layer.0.SelfAttention.k.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.7.layer.0.SelfAttention.o.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.7.layer.0.SelfAttention.q.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.7.layer.0.SelfAttention.v.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.7.layer.0.layer_norm.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.7.layer.1.DenseReluDense.wo.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.7.layer.1.layer_norm.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.8.layer.0.SelfAttention.k.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.8.layer.0.SelfAttention.o.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.8.layer.0.SelfAttention.q.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.8.layer.0.SelfAttention.v.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.8.layer.0.layer_norm.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.8.layer.1.DenseReluDense.wo.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.8.layer.1.layer_norm.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.9.layer.0.SelfAttention.k.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.9.layer.0.SelfAttention.o.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.9.layer.0.SelfAttention.q.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.9.layer.0.SelfAttention.v.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.9.layer.0.layer_norm.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.9.layer.1.DenseReluDense.wo.weight": "model-00002-of-00004.safetensors",
-     "encoder.block.9.layer.1.layer_norm.weight": "model-00002-of-00004.safetensors",
-     "encoder.final_layer_norm.weight": "model-00004-of-00004.safetensors",
-     "shared.weight": "model-00001-of-00004.safetensors"
-   }
- }
transformer/config.json CHANGED
@@ -1,30 +1,38 @@
 {
   "_class_name": "AllegroTransformer3DModel",
-  "_diffusers_version": "0.31.0.dev0",
   "activation_fn": "gelu-approximate",
   "attention_bias": true,
   "attention_head_dim": 96,
   "caption_channels": 4096,
   "cross_attention_dim": 2304,
   "dropout": 0.0,
   "in_channels": 4,
   "interpolation_scale_h": 2.0,
   "interpolation_scale_t": 2.2,
   "interpolation_scale_w": 2.0,
   "norm_elementwise_affine": false,
   "norm_eps": 1e-06,
   "norm_type": "ada_norm_single",
   "num_attention_heads": 24,
   "num_layers": 32,
   "out_channels": 4,
   "patch_size": 2,
   "patch_size_t": 1,
-  "sample_frames": 22,
-  "sample_height": 90,
   "sample_size": [
     90,
     160
   ],
   "sample_size_t": 22,
-  "sample_width": 160
 }
 
 {
   "_class_name": "AllegroTransformer3DModel",
+  "_diffusers_version": "0.28.0",
   "activation_fn": "gelu-approximate",
   "attention_bias": true,
   "attention_head_dim": 96,
+  "ca_attention_mode": "xformers",
   "caption_channels": 4096,
   "cross_attention_dim": 2304,
+  "double_self_attention": false,
+  "downsampler": null,
   "dropout": 0.0,
   "in_channels": 4,
   "interpolation_scale_h": 2.0,
   "interpolation_scale_t": 2.2,
   "interpolation_scale_w": 2.0,
+  "model_max_length": 300,
   "norm_elementwise_affine": false,
   "norm_eps": 1e-06,
   "norm_type": "ada_norm_single",
   "num_attention_heads": 24,
+  "num_embeds_ada_norm": 1000,
   "num_layers": 32,
+  "only_cross_attention": false,
   "out_channels": 4,
   "patch_size": 2,
   "patch_size_t": 1,
+  "sa_attention_mode": "flash",
   "sample_size": [
     90,
     160
   ],
   "sample_size_t": 22,
+  "upcast_attention": false,
+  "use_additional_conditions": null,
+  "use_linear_projection": false,
+  "use_rope": true
 }
vae/config.json CHANGED
@@ -1,6 +1,6 @@
 {
-  "_class_name": "AutoencoderKLAllegro",
-  "_diffusers_version": "0.31.0.dev0",
   "act_fn": "silu",
   "block_out_channels": [
     128,
@@ -8,37 +8,33 @@
     512,
     512
   ],
-  "down_block_types": [
-    "AllegroDownBlock3D",
-    "AllegroDownBlock3D",
-    "AllegroDownBlock3D",
-    "AllegroDownBlock3D"
-  ],
-  "force_upcast": true,
-  "in_channels": 3,
-  "latent_channels": 4,
-  "layers_per_block": 2,
-  "norm_num_groups": 32,
-  "out_channels": 3,
-  "sample_size": 320,
-  "scaling_factor": 0.13,
-  "temporal_compression_ratio": 4,
-  "temporal_downsample_blocks": [
     true,
     true,
     false,
     false
   ],
-  "temporal_upsample_blocks": [
     false,
     true,
     true,
     false
   ],
-  "up_block_types": [
-    "AllegroUpBlock3D",
-    "AllegroUpBlock3D",
-    "AllegroUpBlock3D",
-    "AllegroUpBlock3D"
-  ]
 }
 
 {
+  "_class_name": "AllegroAutoencoderKL3D",
+  "_diffusers_version": "0.28.0",
   "act_fn": "silu",
   "block_out_channels": [
     128,

     512,
     512
   ],
+  "blocks_tempdown_li": [
     true,
     true,
     false,
     false
   ],
+  "blocks_tempup_li": [
     false,
     true,
     true,
     false
   ],
+  "chunk_len": 24,
+  "down_block_num": 4,
+  "force_upcast": true,
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "load_mode": "full",
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 320,
+  "scale_factor": 0.13,
+  "t_over": 8,
+  "tile_overlap": [
+    120,
+    80
+  ],
+  "up_block_num": 4
 }