Spaces:

Lightricks
/

LTX-Video-Playground

Running on A100

App Files Files Community

benibraz committed on Nov 21, 2024

Commit

fc65614

1 Parent(s): 637b686

different tabs for different functionality

Browse files

Files changed (1) hide show

app.py +246 -71

app.py CHANGED Viewed

@@ -24,12 +24,14 @@ hf_token = os.getenv("HF_TOKEN")
 # Set model download directory within Hugging Face Spaces
 model_path = "asset"
 if not os.path.exists(model_path):
-    snapshot_download("Lightricks/LTX-Video", local_dir=model_path, repo_type='model', token=hf_token)
 # Global variables to load components
-vae_dir = Path(model_path) / 'vae'
-unet_dir = Path(model_path) / 'unet'
-scheduler_dir = Path(model_path) / 'scheduler'
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -37,7 +39,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 def load_vae(vae_dir):
     vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
     vae_config_path = vae_dir / "config.json"
-    with open(vae_config_path, 'r') as f:
         vae_config = json.load(f)
     vae = CausalVideoAutoencoder.from_config(vae_config)
     vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
@@ -69,11 +71,11 @@ def center_crop_and_resize(frame, target_height, target_width):
     if aspect_ratio_frame > aspect_ratio_target:
         new_width = int(h * aspect_ratio_target)
         x_start = (w - new_width) // 2
-        frame_cropped = frame[:, x_start:x_start + new_width]
     else:
         new_height = int(w / aspect_ratio_target)
         y_start = (h - new_height) // 2
-        frame_cropped = frame[y_start:y_start + new_height, :]
     frame_resized = cv2.resize(frame_cropped, (target_width, target_height))
     return frame_resized
@@ -116,7 +118,7 @@ preset_options = [
     {"label": "544x320, 241 frames", "width": 544, "height": 320, "num_frames": 241},
     {"label": "512x320, 249 frames", "width": 512, "height": 320, "num_frames": 249},
     {"label": "512x320, 257 frames", "width": 512, "height": 320, "num_frames": 257},
-    {"label": "Custom", "height": None, "width": None, "num_frames": None}
 ]
@@ -130,10 +132,17 @@ def preset_changed(preset):
             selected["num_frames"],
             gr.update(visible=False),
             gr.update(visible=False),
-            gr.update(visible=False)
         )
     else:
-        return None, None, None, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
 # Load models
@@ -141,8 +150,12 @@ vae = load_vae(vae_dir)
 unet = load_unet(unet_dir)
 scheduler = load_scheduler(scheduler_dir)
 patchifier = SymmetricPatchifier(patch_size=1)
-text_encoder = T5EncoderModel.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder").to(device)
-tokenizer = T5Tokenizer.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer")
 pipeline = XoraVideoPipeline(
     transformer=unet,
@@ -154,26 +167,108 @@ pipeline = XoraVideoPipeline(
 ).to(device)
-# Modified function to include validation with gr.Error
-#@spaces.GPU(duration=120)
-def generate_video(image_path=None, prompt="", negative_prompt="",
-                   seed=171198, num_inference_steps=40, num_images_per_prompt=1,
-                   guidance_scale=3, height=512, width=768, num_frames=121, frame_rate=25, progress=gr.Progress()):
-    # Check prompt length and raise an error if it's too short
     if len(prompt.strip()) < 50:
-        raise gr.Error("Prompt must be at least 50 characters long. Please provide more details for the best results.", duration=5)
-    if image_path:
-        media_items = load_image_to_tensor_with_resize(image_path, height, width).to(device)
-    media_items=None
     sample = {
         "prompt": prompt,
-        'prompt_attention_mask': None,
-        'negative_prompt': negative_prompt,
-        'negative_prompt_attention_mask': None,
-        'media_items': media_items,
     }
     generator = torch.Generator(device="cpu").manual_seed(seed)
@@ -196,14 +291,16 @@ def generate_video(image_path=None, prompt="", negative_prompt="",
         vae_per_channel_normalize=True,
         conditioning_method=ConditioningMethod.FIRST_FRAME,
         mixed_precision=True,
-        callback_on_step_end=gradio_progress_callback
     ).images
     output_path = tempfile.mktemp(suffix=".mp4")
     video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
     video_np = (video_np * 255).astype(np.uint8)
     height, width = video_np.shape[1:3]
-    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), frame_rate, (width, height))
     for frame in video_np[..., ::-1]:
         out.write(frame)
     out.release()
@@ -211,55 +308,133 @@ def generate_video(image_path=None, prompt="", negative_prompt="",
     return output_path
-# Define the Gradio interface with presets
 with gr.Blocks() as iface:
     gr.Markdown("# Video Generation with LTX Video")
-    with gr.Row():
-        with gr.Column():
-            image_input = gr.Image(type="filepath", label="Image Input")
-            prompt = gr.Textbox(label="Prompt", value="A man riding a motorcycle down a winding road, surrounded by lush, green scenery and distant mountains. The sky is clear with a few wispy clouds, and the sunlight glistens on the motorcycle as it speeds along. The rider is dressed in a black leather jacket and helmet, leaning slightly forward as the wind rustles through nearby trees. The wheels kick up dust, creating a slight trail behind the motorcycle, adding a sense of speed and excitement to the scene.")
-            negative_prompt = gr.Textbox(label="Negative Prompt", value="worst quality, inconsistent motion...")
-            # Preset dropdown for resolution and frame settings
-            preset_dropdown = gr.Dropdown(
-                choices=[p["label"] for p in preset_options],
-                value="1216x704, 41 frames",
-                label="Resolution Preset"
-            )
-            # Advanced options section
-            with gr.Accordion("Advanced Options", open=False):
-                seed = gr.Slider(label="Seed", minimum=0, maximum=1000000, step=1, value=171198)
-                inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=100, step=1, value=40)
-                images_per_prompt = gr.Slider(label="Images per Prompt", minimum=1, maximum=10, step=1, value=1)
-                guidance_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=20.0, step=0.1, value=3.0)
-                # Sliders to appear at the end of the advanced settings
-                height_slider = gr.Slider(label="Height", minimum=256, maximum=1024, step=64, value=704, visible=False)
-                width_slider = gr.Slider(label="Width", minimum=256, maximum=1024, step=64, value=1216, visible=False)
-                num_frames_slider = gr.Slider(label="Number of Frames", minimum=1, maximum=200, step=1, value=41,
-                                              visible=False)
-                frame_rate = gr.Slider(label="Frame Rate", minimum=1, maximum=60, step=1, value=25, visible=False)
-            generate_button = gr.Button("Generate Video")
-        with gr.Column():
-            output_video = gr.Video(label="Generated Video")
-    # Link dropdown change to update sliders visibility and values
-    preset_dropdown.change(
         fn=preset_changed,
-        inputs=[preset_dropdown],
-        outputs=[height_slider, width_slider, num_frames_slider, height_slider, width_slider, frame_rate]
     )
-    generate_button.click(
-        fn=generate_video,
-        inputs=[image_input, prompt, negative_prompt, seed, inference_steps, images_per_prompt, guidance_scale,
-                height_slider, width_slider, num_frames_slider, frame_rate],
-        outputs=output_video
     )
 iface.launch(share=True)

 # Set model download directory within Hugging Face Spaces
 model_path = "asset"
 if not os.path.exists(model_path):
+    snapshot_download(
+        "Lightricks/LTX-Video", local_dir=model_path, repo_type="model", token=hf_token
+    )
 # Global variables to load components
+vae_dir = Path(model_path) / "vae"
+unet_dir = Path(model_path) / "unet"
+scheduler_dir = Path(model_path) / "scheduler"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 def load_vae(vae_dir):
     vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
     vae_config_path = vae_dir / "config.json"
+    with open(vae_config_path, "r") as f:
         vae_config = json.load(f)
     vae = CausalVideoAutoencoder.from_config(vae_config)
     vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
     if aspect_ratio_frame > aspect_ratio_target:
         new_width = int(h * aspect_ratio_target)
         x_start = (w - new_width) // 2
+        frame_cropped = frame[:, x_start : x_start + new_width]
     else:
         new_height = int(w / aspect_ratio_target)
         y_start = (h - new_height) // 2
+        frame_cropped = frame[y_start : y_start + new_height, :]
     frame_resized = cv2.resize(frame_cropped, (target_width, target_height))
     return frame_resized
     {"label": "544x320, 241 frames", "width": 544, "height": 320, "num_frames": 241},
     {"label": "512x320, 249 frames", "width": 512, "height": 320, "num_frames": 249},
     {"label": "512x320, 257 frames", "width": 512, "height": 320, "num_frames": 257},
+    {"label": "Custom", "height": None, "width": None, "num_frames": None},
 ]
             selected["num_frames"],
             gr.update(visible=False),
             gr.update(visible=False),
+            gr.update(visible=False),
         )
     else:
+        return (
+            None,
+            None,
+            None,
+            gr.update(visible=True),
+            gr.update(visible=True),
+            gr.update(visible=True),
+        )
 # Load models
 unet = load_unet(unet_dir)
 scheduler = load_scheduler(scheduler_dir)
 patchifier = SymmetricPatchifier(patch_size=1)
+text_encoder = T5EncoderModel.from_pretrained(
+    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
+).to(device)
+tokenizer = T5Tokenizer.from_pretrained(
+    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
+)
 pipeline = XoraVideoPipeline(
     transformer=unet,
 ).to(device)
+import gradio as gr
+import torch
+from huggingface_hub import snapshot_download
+# [Previous imports remain the same...]
def generate_video_from_text(
    prompt="",
    negative_prompt="",
    seed=171198,
    num_inference_steps=40,
    num_images_per_prompt=1,
    guidance_scale=3,
    height=512,
    width=768,
    num_frames=121,
    frame_rate=25,
    progress=gr.Progress(),
):
    """Run the pipeline on a text prompt and write the result to an MP4 file.

    Args:
        prompt: Text description of the video; must be at least 50
            characters long after stripping surrounding whitespace.
        negative_prompt: Text passed to the pipeline as ``negative_prompt``.
        seed: Seed for the CPU ``torch.Generator`` handed to the pipeline.
        num_inference_steps: Forwarded to the pipeline; also used as the
            denominator of the reported progress fraction.
        num_images_per_prompt: Forwarded to the pipeline.
        guidance_scale: Forwarded to the pipeline.
        height: Requested frame height, forwarded to the pipeline.
        width: Requested frame width, forwarded to the pipeline.
        num_frames: Requested number of frames, forwarded to the pipeline.
        frame_rate: Forwarded to the pipeline and used as the FPS of the
            written video.
        progress: Gradio progress tracker updated after every step.

    Returns:
        Path of the ``.mp4`` file that was written.

    Raises:
        gr.Error: If the stripped prompt is shorter than 50 characters.
    """
    if len(prompt.strip()) < 50:
        raise gr.Error(
            "Prompt must be at least 50 characters long. Please provide more details for the best results.",
            duration=5,
        )

    # Text-to-video has no conditioning media, so ``media_items`` is None.
    sample = {
        "prompt": prompt,
        "prompt_attention_mask": None,
        "negative_prompt": negative_prompt,
        "negative_prompt_attention_mask": None,
        "media_items": None,
    }
    generator = torch.Generator(device="cpu").manual_seed(seed)

    def gradio_progress_callback(self, step, timestep, kwargs):
        # ``step`` is treated as zero-based, hence the +1.
        progress((step + 1) / num_inference_steps)

    images = pipeline(
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=num_images_per_prompt,
        guidance_scale=guidance_scale,
        generator=generator,
        output_type="pt",
        height=height,
        width=width,
        num_frames=num_frames,
        frame_rate=frame_rate,
        **sample,
        is_video=True,
        vae_per_channel_normalize=True,
        conditioning_method=ConditioningMethod.FIRST_FRAME,
        mixed_precision=True,
        callback_on_step_end=gradio_progress_callback,
    ).images

    # tempfile.mktemp() only returns a name and is deprecated because another
    # process can claim that name first. Creating the file up front (and
    # keeping it with delete=False) reserves the path for the video writer.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
        output_path = tmp_file.name

    # Drop the leading dimension and move the first remaining axis to the end.
    # NOTE(review): presumably (batch, C, F, H, W) -> (F, H, W, C) with values
    # in [0, 1]; confirm against the pipeline's output contract.
    video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
    video_np = (video_np * 255).astype(np.uint8)

    # Use the size of the produced frames rather than the requested size.
    height, width = video_np.shape[1:3]
    out = cv2.VideoWriter(
        output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height)
    )
    try:
        # Reverse the last axis before writing (presumably RGB -> BGR, the
        # channel order OpenCV writers expect).
        for frame in video_np[..., ::-1]:
            out.write(frame)
    finally:
        # Always finalize the container, even if a frame write fails.
        out.release()
    return output_path
+def generate_video_from_image(
+    image_path,
+    prompt="",
+    negative_prompt="",
+    seed=171198,
+    num_inference_steps=40,
+    num_images_per_prompt=1,
+    guidance_scale=3,
+    height=512,
+    width=768,
+    num_frames=121,
+    frame_rate=25,
+    progress=gr.Progress(),
+):
     if len(prompt.strip()) < 50:
+        raise gr.Error(
+            "Prompt must be at least 50 characters long. Please provide more details for the best results.",
+            duration=5,
+        )
+    if not image_path:
+        raise gr.Error("Please provide an input image.", duration=5)
+    media_items = load_image_to_tensor_with_resize(image_path, height, width).to(device)
     sample = {
         "prompt": prompt,
+        "prompt_attention_mask": None,
+        "negative_prompt": negative_prompt,
+        "negative_prompt_attention_mask": None,
+        "media_items": media_items,
     }
     generator = torch.Generator(device="cpu").manual_seed(seed)
         vae_per_channel_normalize=True,
         conditioning_method=ConditioningMethod.FIRST_FRAME,
         mixed_precision=True,
+        callback_on_step_end=gradio_progress_callback,
     ).images
     output_path = tempfile.mktemp(suffix=".mp4")
     video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
     video_np = (video_np * 255).astype(np.uint8)
     height, width = video_np.shape[1:3]
+    out = cv2.VideoWriter(
+        output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height)
+    )
     for frame in video_np[..., ::-1]:
         out.write(frame)
     out.release()
     return output_path
def create_advanced_options():
    """Build the "Advanced Options" accordion and return its sliders.

    Must be called inside an active Gradio layout context, because the
    components attach to whatever container is currently open.

    Returns:
        A list of eight sliders in this fixed order: seed, inference steps,
        images per prompt, guidance scale, height, width, number of frames,
        frame rate. Callers rely on this order when slicing or unpacking.
    """
    with gr.Accordion("Advanced Options", open=False):
        seed = gr.Slider(label="Seed", minimum=0, maximum=1000000, step=1, value=171198)
        inference_steps = gr.Slider(
            label="Inference Steps", minimum=1, maximum=100, step=1, value=40
        )
        images_per_prompt = gr.Slider(
            label="Images per Prompt", minimum=1, maximum=10, step=1, value=1
        )
        guidance_scale = gr.Slider(
            label="Guidance Scale", minimum=1.0, maximum=20.0, step=0.1, value=3.0
        )
        # The size/frame sliders start hidden; they are only meant to be
        # shown when a custom resolution is selected.
        height_slider = gr.Slider(
            label="Height", minimum=256, maximum=1024, step=64, value=704, visible=False
        )
        # The maximum must cover the default value (1216); the previous bound
        # of 1024 put the slider's own initial value out of range.
        width_slider = gr.Slider(
            label="Width", minimum=256, maximum=1216, step=64, value=1216, visible=False
        )
        # The resolution presets go up to 257 frames, so the slider has to
        # accept at least that many (the previous bound was 200).
        num_frames_slider = gr.Slider(
            label="Number of Frames",
            minimum=1,
            maximum=257,
            step=1,
            value=41,
            visible=False,
        )
        frame_rate = gr.Slider(
            label="Frame Rate", minimum=1, maximum=60, step=1, value=25, visible=False
        )
        return [
            seed,
            inference_steps,
            images_per_prompt,
            guidance_scale,
            height_slider,
            width_slider,
            num_frames_slider,
            frame_rate,
        ]
# Define the Gradio interface with tabs: one tab per generation mode, each
# with its own prompt boxes, preset dropdown, advanced options and output.
with gr.Blocks() as iface:
    gr.Markdown("# Video Generation with LTX Video")
    with gr.Tabs():
        with gr.TabItem("Text to Video"):
            with gr.Row():
                with gr.Column():
                    txt2vid_prompt = gr.Textbox(
                        label="Prompt",
                        value="A man riding a motorcycle down a winding road, surrounded by lush, green scenery and distant mountains. The sky is clear with a few wispy clouds, and the sunlight glistens on the motorcycle as it speeds along. The rider is dressed in a black leather jacket and helmet, leaning slightly forward as the wind rustles through nearby trees. The wheels kick up dust, creating a slight trail behind the motorcycle, adding a sense of speed and excitement to the scene.",
                    )
                    txt2vid_negative_prompt = gr.Textbox(
                        label="Negative Prompt",
                        value="worst quality, inconsistent motion...",
                    )
                    # Preset dropdown for resolution and frame settings
                    txt2vid_preset = gr.Dropdown(
                        choices=[p["label"] for p in preset_options],
                        value="1216x704, 41 frames",
                        label="Resolution Preset",
                    )
                    txt2vid_advanced = create_advanced_options()
                    txt2vid_generate = gr.Button("Generate Video")
                with gr.Column():
                    txt2vid_output = gr.Video(label="Generated Video")
        with gr.TabItem("Image to Video"):
            with gr.Row():
                with gr.Column():
                    img2vid_image = gr.Image(type="filepath", label="Input Image")
                    img2vid_prompt = gr.Textbox(
                        label="Prompt",
                        value="A man riding a motorcycle down a winding road, surrounded by lush, green scenery and distant mountains...",
                    )
                    img2vid_negative_prompt = gr.Textbox(
                        label="Negative Prompt",
                        value="worst quality, inconsistent motion...",
                    )
                    img2vid_preset = gr.Dropdown(
                        choices=[p["label"] for p in preset_options],
                        value="1216x704, 41 frames",
                        label="Resolution Preset",
                    )
                    img2vid_advanced = create_advanced_options()
                    img2vid_generate = gr.Button("Generate Video")
                with gr.Column():
                    img2vid_output = gr.Video(label="Generated Video")

    # create_advanced_options() returns its sliders in a fixed order; indices
    # 4..6 are the height, width and number-of-frames sliders.
    txt2vid_size_sliders = txt2vid_advanced[4:7]
    img2vid_size_sliders = img2vid_advanced[4:7]

    # Event handlers for text-to-video tab.
    # preset_changed returns six values: three slider values followed by three
    # visibility updates. The outputs list therefore needs six entries; the
    # size sliders are listed twice so each one receives both its value and
    # its visibility update. With only four outputs the visibility updates
    # were misrouted and the custom-size sliders never appeared.
    # NOTE(review): assumes the first three returned values are ordered
    # height, width, num_frames - confirm against preset_changed.
    txt2vid_preset.change(
        fn=preset_changed,
        inputs=[txt2vid_preset],
        outputs=[*txt2vid_size_sliders, *txt2vid_size_sliders],
    )
    txt2vid_generate.click(
        fn=generate_video_from_text,
        inputs=[txt2vid_prompt, txt2vid_negative_prompt, *txt2vid_advanced],
        outputs=txt2vid_output,
    )

    # Event handlers for image-to-video tab (same six-output wiring as above).
    img2vid_preset.change(
        fn=preset_changed,
        inputs=[img2vid_preset],
        outputs=[*img2vid_size_sliders, *img2vid_size_sliders],
    )
    img2vid_generate.click(
        fn=generate_video_from_image,
        inputs=[
            img2vid_image,
            img2vid_prompt,
            img2vid_negative_prompt,
            *img2vid_advanced,
        ],
        outputs=img2vid_output,
    )

iface.launch(share=True)