eranlevinlt committed on
Commit
f1a05f0
1 Parent(s): af41e07
Files changed (1) hide show
  1. app.py +266 -0
app.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ import torch
4
+ from huggingface_hub import snapshot_download
5
+
6
+ from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
7
+ from xora.models.transformers.transformer3d import Transformer3DModel
8
+ from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier
9
+ from xora.schedulers.rf import RectifiedFlowScheduler
10
+ from xora.pipelines.pipeline_xora_video import XoraVideoPipeline
11
+ from transformers import T5EncoderModel, T5Tokenizer
12
+ from xora.utils.conditioning_method import ConditioningMethod
13
+ from pathlib import Path
14
+ import safetensors.torch
15
+ import json
16
+ import numpy as np
17
+ import cv2
18
+ from PIL import Image
19
+ import tempfile
20
+ import os
21
+
22
# Optional access token, read from the HF_TOKEN environment variable
# (None when the variable is not set).
hf_token = os.environ.get("HF_TOKEN")

# Local directory that holds the model snapshot. The download is skipped
# whenever this directory already exists.
model_path = "asset"
if not os.path.exists(model_path):
    snapshot_download(
        "Lightricks/Xora",
        local_dir=model_path,
        repo_type='model',
        token=hf_token,
    )

# Per-component sub-directories inside the snapshot.
vae_dir, unet_dir, scheduler_dir = (
    Path(model_path) / component for component in ('vae', 'unet', 'scheduler')
)

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
36
+
37
+
38
def load_vae(vae_dir):
    """Build the video VAE from its JSON config and load its safetensors weights.

    Args:
        vae_dir: ``pathlib.Path``-like directory (joined with ``/``) that
            contains ``config.json`` and
            ``vae_diffusion_pytorch_model.safetensors``.

    Returns:
        The ``CausalVideoAutoencoder`` moved to the module-level ``device``
        and cast to ``torch.bfloat16``.
    """
    vae_config_path = vae_dir / "config.json"
    vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"

    # Explicit encoding so parsing does not depend on the host locale.
    with open(vae_config_path, "r", encoding="utf-8") as f:
        vae_config = json.load(f)

    vae = CausalVideoAutoencoder.from_config(vae_config)
    vae.load_state_dict(safetensors.torch.load_file(vae_ckpt_path))

    # Follow the module-level `device` instead of calling .cuda()
    # unconditionally: `device` already falls back to the CPU when CUDA is
    # unavailable, whereas .cuda() would raise on such a host.
    return vae.to(device).to(torch.bfloat16)
47
+
48
+
49
def load_unet(unet_dir):
    """Instantiate the 3D transformer from its config and load its weights.

    Args:
        unet_dir: ``pathlib.Path``-like directory (joined with ``/``) that
            contains ``config.json`` and
            ``unet_diffusion_pytorch_model.safetensors``.

    Returns:
        The ``Transformer3DModel`` with weights loaded (strict key matching),
        moved to the module-level ``device``.
    """
    config = Transformer3DModel.load_config(unet_dir / "config.json")
    model = Transformer3DModel.from_config(config)

    weights = safetensors.torch.load_file(
        unet_dir / "unet_diffusion_pytorch_model.safetensors"
    )
    # strict=True: any missing or unexpected key in the checkpoint is an error.
    model.load_state_dict(weights, strict=True)
    return model.to(device)
57
+
58
+
59
def load_scheduler(scheduler_dir):
    """Create the rectified-flow scheduler from ``scheduler_config.json``.

    Args:
        scheduler_dir: ``pathlib.Path``-like directory (joined with ``/``)
            that contains ``scheduler_config.json``.

    Returns:
        A ``RectifiedFlowScheduler`` built from the loaded config.
    """
    config = RectifiedFlowScheduler.load_config(
        scheduler_dir / "scheduler_config.json"
    )
    return RectifiedFlowScheduler.from_config(config)
63
+
64
+
65
+ # Helper function for image processing
66
def center_crop_and_resize(frame, target_height, target_width):
    """Center-crop ``frame`` to the target aspect ratio, then resize it.

    Args:
        frame: Array whose first two axes are (height, width). Any further
            axis (e.g. channels) is left untouched by the crop, so 2-D
            frames are accepted as well as H x W x C ones.
        target_height: Output height in pixels (coerced with ``int``).
        target_width: Output width in pixels (coerced with ``int``).

    Returns:
        The result of ``cv2.resize`` applied to the cropped frame with
        ``(target_width, target_height)`` as the destination size.
    """
    # cv2.resize needs integer sizes; UI widgets may hand over floats.
    target_height = int(target_height)
    target_width = int(target_width)
    h, w = frame.shape[:2]

    # Aspect ratios are compared by cross-multiplication and crop sizes are
    # computed with integer floor division. This avoids float rounding that
    # can truncate an exact result one pixel short (e.g. int(49 * (1 / 49))
    # evaluates to 0 rather than 1).
    if w * target_height > h * target_width:
        # Frame is wider than the target: keep full height, trim the sides.
        new_width = max(1, h * target_width // target_height)
        x_start = (w - new_width) // 2
        frame_cropped = frame[:, x_start:x_start + new_width]
    else:
        # Frame is taller than (or matches) the target: trim top and bottom.
        new_height = max(1, w * target_height // target_width)
        y_start = (h - new_height) // 2
        frame_cropped = frame[y_start:y_start + new_height, :]

    return cv2.resize(frame_cropped, (target_width, target_height))
80
+
81
+
82
def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
    """Load an image file as a normalized single-frame video tensor.

    The image is converted to RGB, center-cropped and resized through
    ``center_crop_and_resize``, then scaled from [0, 255] to [-1, 1].

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        target_height: Height passed to ``center_crop_and_resize``.
        target_width: Width passed to ``center_crop_and_resize``.

    Returns:
        A float tensor with two singleton axes added around the channel
        axis: (1, channels, 1, height, width).
    """
    # The context manager releases the underlying file handle as soon as the
    # pixels have been copied into the numpy array.
    with Image.open(image_path) as image:
        image_np = np.array(image.convert("RGB"))

    frame_resized = center_crop_and_resize(image_np, target_height, target_width)

    # H x W x C -> C x H x W, then map [0, 255] onto [-1, 1].
    frame_tensor = torch.tensor(frame_resized).permute(2, 0, 1).float()
    frame_tensor = (frame_tensor / 127.5) - 1.0

    # Add a leading axis and a singleton axis after the channels.
    return frame_tensor.unsqueeze(0).unsqueeze(2)
89
+
90
+
91
# Preset options for resolution and frame configuration.
# Every (height, width, num_frames) triple below becomes a dict labelled
# "<height>x<width>, <num_frames> frames"; the trailing "Custom" entry
# carries None for all three fields.
preset_options = [
    {"label": f"{h}x{w}, {n} frames", "height": h, "width": w, "num_frames": n}
    for h, w, n in (
        (704, 1216, 41),
        (704, 1088, 49),
        (640, 1056, 57),
        (608, 992, 65),
        (608, 896, 73),
        (544, 896, 81),
        (544, 832, 89),
        (512, 800, 97),
        (512, 768, 97),
        (480, 800, 105),
        (480, 736, 113),
        (480, 704, 121),
        (448, 704, 129),
        (448, 672, 137),
        (416, 640, 153),
        (384, 672, 161),
        (384, 640, 169),
        (384, 608, 177),
        (384, 576, 185),
        (352, 608, 193),
        (352, 576, 201),
        (352, 544, 209),
        (352, 512, 225),
        (352, 512, 233),
        (320, 544, 241),
        (320, 512, 249),
        (320, 512, 257),
    )
] + [{"label": "Custom", "height": None, "width": None, "num_frames": None}]
122
+
123
+
124
+ # Function to toggle visibility of sliders based on preset selection
125
def preset_changed(preset):
    """Translate a preset label into slider values and visibility updates.

    Args:
        preset: Label chosen in the preset dropdown.

    Returns:
        A 6-tuple: three value slots (height, width, num_frames) followed by
        three ``gr.update(visible=...)`` objects.

        * Known preset: the values come from ``preset_options`` and the
          visibility updates hide the sliders.
        * "Custom", or a label not present in ``preset_options``: the value
          slots are no-op ``gr.update()`` objects and the visibility updates
          show the sliders.
    """
    selected = None
    if preset != "Custom":
        # A default of None avoids a bare StopIteration for unknown labels.
        selected = next(
            (item for item in preset_options if item["label"] == preset),
            None,
        )

    if selected is None:
        # No-op updates leave the sliders holding their current values.
        # Returning None in these slots would instead send None as the new
        # slider value at the moment the sliders become editable.
        keep_values = tuple(gr.update() for _ in range(3))
        show_sliders = tuple(gr.update(visible=True) for _ in range(3))
        return keep_values + show_sliders

    hide_sliders = tuple(gr.update(visible=False) for _ in range(3))
    return (
        selected["height"],
        selected["width"],
        selected["num_frames"],
    ) + hide_sliders
138
+
139
+
140
+ # Load models
141
# Build every pipeline component once, at import time.
vae = load_vae(vae_dir)
unet = load_unet(unet_dir)
scheduler = load_scheduler(scheduler_dir)
patchifier = SymmetricPatchifier(patch_size=1)

# Text encoder and tokenizer are both taken from sub-folders of the same
# PixArt repository; only the encoder is moved to `device`.
text_encoder = T5EncoderModel.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    subfolder="text_encoder",
)
text_encoder = text_encoder.to(device)
tokenizer = T5Tokenizer.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    subfolder="tokenizer",
)

# Assemble the video pipeline from the components above and move it to `device`.
pipeline = XoraVideoPipeline(
    transformer=unet,
    patchifier=patchifier,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    scheduler=scheduler,
    vae=vae,
)
pipeline = pipeline.to(device)
156
+
157
+
158
+ # Modified function to include validation with gr.Error
159
@spaces.GPU(duration=120)
def generate_video(image_path=None, prompt="", negative_prompt="",
                   seed=171198, num_inference_steps=40, num_images_per_prompt=1,
                   guidance_scale=3, height=512, width=768, num_frames=121, frame_rate=25, progress=gr.Progress()):
    """Generate a video conditioned on a first-frame image and save it as MP4.

    Args:
        image_path: Path of the conditioning image. Required despite the
            ``None`` default.
        prompt: Text prompt; must be at least 50 characters after stripping.
        negative_prompt: Negative prompt forwarded to the pipeline.
        seed: Seed for the CPU ``torch.Generator`` (coerced with ``int``).
        num_inference_steps: Forwarded to the pipeline; also the denominator
            of the progress fraction.
        num_images_per_prompt: Forwarded to the pipeline. Only the first
            generated video is written to disk.
        guidance_scale: Forwarded to the pipeline.
        height: Forwarded to the pipeline; also the resize height of the image.
        width: Forwarded to the pipeline; also the resize width of the image.
        num_frames: Forwarded to the pipeline.
        frame_rate: Forwarded to the pipeline; also the FPS of the MP4 file.
        progress: Gradio progress tracker, updated after every step.

    Returns:
        Path of the written ``.mp4`` file.

    Raises:
        gr.Error: If the stripped prompt is shorter than 50 characters.
        ValueError: If ``image_path`` is empty or ``None``.
    """
    # Check prompt length and raise an error if it's too short
    if len(prompt.strip()) < 50:
        raise gr.Error("Prompt must be at least 50 characters long. Please provide more details for the best results.", duration=5)

    if not image_path:
        raise ValueError("Image path must be provided.")

    media_items = load_image_to_tensor_with_resize(image_path, height, width).to(device)

    sample = {
        "prompt": prompt,
        'prompt_attention_mask': None,
        'negative_prompt': negative_prompt,
        'negative_prompt_attention_mask': None,
        'media_items': media_items,
    }

    # UI sliders may deliver whole numbers as floats; manual_seed needs an int.
    generator = torch.Generator(device="cpu").manual_seed(int(seed))

    def gradio_progress_callback(self, step, timestep, kwargs):
        # Report the fraction of completed denoising steps to the UI.
        progress((step + 1) / num_inference_steps)

    images = pipeline(
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=num_images_per_prompt,
        guidance_scale=guidance_scale,
        generator=generator,
        output_type="pt",
        height=height,
        width=width,
        num_frames=num_frames,
        frame_rate=frame_rate,
        **sample,
        is_video=True,
        vae_per_channel_normalize=True,
        conditioning_method=ConditioningMethod.FIRST_FRAME,
        mixed_precision=True,
        callback_on_step_end=gradio_progress_callback
    ).images

    # Select the first video by index rather than squeeze(0): squeeze is a
    # no-op when the leading axis is larger than 1 (num_images_per_prompt > 1),
    # and the 4-axis permute below would then fail.
    # NOTE(review): assumes a 5-D result is (batch, channels, frames, H, W) - confirm.
    first_video = images[0] if images.dim() == 5 else images
    video_np = first_video.permute(1, 2, 3, 0).cpu().float().numpy()

    # Clip before the uint8 cast so values outside [0, 1] cannot wrap around.
    video_np = (np.clip(video_np, 0.0, 1.0) * 255).astype(np.uint8)

    # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
    # returns a name and is deprecated because of the resulting race.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
        output_path = tmp_file.name

    frame_height, frame_width = video_np.shape[1:3]
    out = cv2.VideoWriter(
        output_path,
        cv2.VideoWriter_fourcc(*'mp4v'),
        frame_rate,
        (frame_width, frame_height),
    )
    try:
        # Reverse the channel axis (RGB -> BGR) for OpenCV.
        for frame in video_np[..., ::-1]:
            out.write(frame)
    finally:
        # Release the writer even if a frame fails to encode.
        out.release()

    return output_path
213
+
214
+
215
+ # Define the Gradio interface with presets
216
# NOTE(review): the widget nesting below was reconstructed from a listing
# that had lost its indentation - confirm the layout against the running app.
with gr.Blocks() as iface:
    gr.Markdown("# Video Generation with Xora")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Image Input")
            prompt = gr.Textbox(label="Prompt", value="A man riding a motorcycle down a winding road, surrounded by lush, green scenery and distant mountains. The sky is clear with a few wispy clouds, and the sunlight glistens on the motorcycle as it speeds along. The rider is dressed in a black leather jacket and helmet, leaning slightly forward as the wind rustles through nearby trees. The wheels kick up dust, creating a slight trail behind the motorcycle, adding a sense of speed and excitement to the scene.")
            negative_prompt = gr.Textbox(label="Negative Prompt", value="worst quality, inconsistent motion...")

            # Preset dropdown for resolution and frame settings
            preset_dropdown = gr.Dropdown(
                choices=[p["label"] for p in preset_options],
                value="704x1216, 41 frames",
                label="Resolution Preset"
            )

            # Advanced options section
            with gr.Accordion("Advanced Options", open=False):
                seed = gr.Slider(label="Seed", minimum=0, maximum=1000000, step=1, value=171198)
                inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=100, step=1, value=40)
                images_per_prompt = gr.Slider(label="Images per Prompt", minimum=1, maximum=10, step=1, value=1)
                guidance_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=20.0, step=0.1, value=3.0)

                # Sliders to appear at the end of the advanced settings.
                # Their ranges must contain every preset, because presets are
                # written into these sliders: all preset sizes are multiples
                # of 32, the widest preset is 1216 px and the longest is
                # 257 frames.
                height_slider = gr.Slider(label="Height", minimum=256, maximum=1024, step=32, value=704, visible=False)
                width_slider = gr.Slider(label="Width", minimum=256, maximum=1216, step=32, value=1216, visible=False)
                num_frames_slider = gr.Slider(label="Number of Frames", minimum=1, maximum=257, step=1, value=41,
                                              visible=False)

                frame_rate = gr.Slider(label="Frame Rate", minimum=1, maximum=60, step=1, value=25, visible=False)

            generate_button = gr.Button("Generate Video")

        with gr.Column():
            output_video = gr.Video(label="Generated Video")

    # Link dropdown change to update slider values and visibility.
    # preset_changed returns three values followed by three visibility
    # updates, so each of the three sliders is listed twice, in that order.
    preset_dropdown.change(
        fn=preset_changed,
        inputs=[preset_dropdown],
        outputs=[height_slider, width_slider, num_frames_slider, height_slider, width_slider, num_frames_slider]
    )

    # The frame-rate slider is shown only for the "Custom" preset; it has its
    # own listener because preset_changed has no output slot for it.
    preset_dropdown.change(
        fn=lambda preset: gr.update(visible=(preset == "Custom")),
        inputs=[preset_dropdown],
        outputs=[frame_rate]
    )

    generate_button.click(
        fn=generate_video,
        inputs=[image_input, prompt, negative_prompt, seed, inference_steps, images_per_prompt, guidance_scale,
                height_slider, width_slider, num_frames_slider, frame_rate],
        outputs=output_video
    )

iface.launch(share=True)