Update demo.py
demo.py CHANGED
@@ -4,8 +4,7 @@ import torch
 import argparse
 import spaces
 import torchvision
-
-
+from transformers import pipeline
 from pipelines.pipeline_videogen import VideoGenPipeline
 from diffusers.schedulers import DDIMScheduler
 from diffusers.models import AutoencoderKL
@@ -27,7 +26,15 @@ from copy import deepcopy
 import requests
 from datetime import datetime
 import random
-
+
+# Create the translation pipeline
+translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
+
+# Translation helper
+def translate_prompt(korean_prompt):
+    translation = translator(korean_prompt, max_length=512)
+    return translation[0]['translation_text']
+
 parser = argparse.ArgumentParser()
 parser.add_argument("--config", type=str, default="./configs/sample.yaml")
 args = parser.parse_args()
@@ -35,7 +42,7 @@ args = OmegaConf.load(args.config)
 
 torch.set_grad_enabled(False)
 device = "cuda" if torch.cuda.is_available() else "cpu"
-dtype = torch.float16
+dtype = torch.float16
 
 unet = get_models(args).to(device, dtype=dtype)
 
@@ -49,15 +56,14 @@ else:
     vae_for_base_content = AutoencoderKL.from_pretrained(args.pretrained_model_path, subfolder="vae",).to(device, dtype=torch.float64)
 vae = deepcopy(vae_for_base_content).to(dtype=dtype)
 tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_path, subfolder="tokenizer")
-text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_path, subfolder="text_encoder", torch_dtype=dtype).to(device)
+text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_path, subfolder="text_encoder", torch_dtype=dtype).to(device)
 
-# set eval mode
 unet.eval()
 vae.eval()
 text_encoder.eval()
 
-basedir
-savedir
+basedir = os.getcwd()
+savedir = os.path.join(basedir, "samples/Gradio", datetime.now().strftime("%Y-%m-%dT%H-%M-%S"))
 savedir_sample = os.path.join(savedir, "sample")
 os.makedirs(savedir, exist_ok=True)
 
@@ -66,56 +72,55 @@ def update_and_resize_image(input_image_path, height_slider, width_slider):
         pil_image = Image.open(requests.get(input_image_path, stream=True).raw).convert('RGB')
     else:
         pil_image = Image.open(input_image_path).convert('RGB')
-
+
     original_width, original_height = pil_image.size
 
     if original_height == height_slider and original_width == width_slider:
         return gr.Image(value=np.array(pil_image))
-
+
     ratio1 = height_slider / original_height
     ratio2 = width_slider / original_width
-
+
     if ratio1 > ratio2:
         new_width = int(original_width * ratio1)
         new_height = int(original_height * ratio1)
     else:
         new_width = int(original_width * ratio2)
         new_height = int(original_height * ratio2)
-
+
     pil_image = pil_image.resize((new_width, new_height), Image.LANCZOS)
-
+
     left = (new_width - width_slider) / 2
     top = (new_height - height_slider) / 2
     right = left + width_slider
     bottom = top + height_slider
-
+
     pil_image = pil_image.crop((left, top, right, bottom))
-
-    return gr.Image(value=np.array(pil_image))
 
+    return gr.Image(value=np.array(pil_image))
 
 def update_textbox_and_save_image(input_image, height_slider, width_slider):
     pil_image = Image.fromarray(input_image.astype(np.uint8)).convert("RGB")
 
     original_width, original_height = pil_image.size
-
+
     ratio1 = height_slider / original_height
     ratio2 = width_slider / original_width
-
+
     if ratio1 > ratio2:
         new_width = int(original_width * ratio1)
         new_height = int(original_height * ratio1)
     else:
         new_width = int(original_width * ratio2)
         new_height = int(original_height * ratio2)
-
+
     pil_image = pil_image.resize((new_width, new_height), Image.LANCZOS)
-
+
     left = (new_width - width_slider) / 2
     top = (new_height - height_slider) / 2
     right = left + width_slider
     bottom = top + height_slider
-
+
     pil_image = pil_image.crop((left, top, right, bottom))
 
     img_path = os.path.join(savedir, "input_image.png")
@@ -130,10 +135,9 @@ def prepare_image(image, vae, transform_video, device, dtype=torch.float16):
     image = image.unsqueeze(2)
     return image
 
-
 @spaces.GPU
-def gen_video(input_image,
-
+def gen_video(input_image, korean_prompt, negative_prompt, diffusion_step, height, width, scfg_scale, use_dctinit, dct_coefficients, noise_level, motion_bucket_id, seed):
+    english_prompt = translate_prompt(korean_prompt)
     torch.manual_seed(seed)
 
     scheduler = DDIMScheduler.from_pretrained(args.pretrained_model_path,
@@ -147,7 +151,6 @@ def gen_video(input_image, prompt, negative_prompt, diffusion_step, height, widt
                                          tokenizer=tokenizer,
                                          scheduler=scheduler,
                                          unet=unet).to(device)
-    # videogen_pipeline.enable_xformers_memory_efficient_attention()
 
     transform_video = transforms.Compose([
         video_transforms.ToTensorVideo(),
@@ -160,33 +163,25 @@ def gen_video(input_image, prompt, negative_prompt, diffusion_step, height, widt
     base_content = prepare_image(input_image, vae_for_base_content, transform_video, device, dtype=torch.float16).to(device)
 
     if use_dctinit:
-        # filter params
-        print("Using DCT!")
         base_content_repeat = repeat(base_content, 'b c f h w -> b c (f r) h w', r=15).contiguous()
-
-        # define filter
         freq_filter = dct_low_pass_filter(dct_coefficients=base_content, percentage=dct_coefficients)
-
-        noise = torch.randn(1, 4, 15, 40, 64).to(device)
 
-
-        diffuse_timesteps = torch.full((1,),int(noise_level))
+        noise = torch.randn(1, 4, 15, 40, 64).to(device)
+        diffuse_timesteps = torch.full((1,), int(noise_level))
         diffuse_timesteps = diffuse_timesteps.long()
-
-        # 3d content
+
         base_content_noise = scheduler.add_noise(
             original_samples=base_content_repeat.to(device),
             noise=noise,
             timesteps=diffuse_timesteps.to(device))
-
-        # 3d content
+
         latents = exchanged_mixed_dct_freq(noise=noise,
                                            base_content=base_content_noise,
                                            LPF_3d=freq_filter).to(dtype=torch.float16)
-
+
     base_content = base_content.to(dtype=torch.float16)
 
-    videos = videogen_pipeline(
+    videos = videogen_pipeline(english_prompt,
                                negative_prompt=negative_prompt,
                                latents=latents if use_dctinit else None,
                                base_content=base_content,
@@ -197,13 +192,11 @@ def gen_video(input_image, prompt, negative_prompt, diffusion_step, height, widt
                                guidance_scale=scfg_scale,
                                motion_bucket_id=100-motion_bucket_id,
                                enable_vae_temporal_decoder=args.enable_vae_temporal_decoder).video
-
+
     save_path = args.save_img_path + 'temp' + '.mp4'
-    # torchvision.io.write_video(save_path, videos[0], fps=8, video_codec='h264', options={'crf': '10'})
     imageio.mimwrite(save_path, videos[0], fps=8, quality=7)
     return save_path
 
-
 if not os.path.exists(args.save_img_path):
     os.makedirs(args.save_img_path)
 
@@ -215,11 +208,9 @@ footer {
 
 with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
 
-
-
     with gr.Column(variant="panel"):
         with gr.Row():
-            prompt_textbox = gr.Textbox(label="Prompt", lines=1)
+            prompt_textbox = gr.Textbox(label="Korean Prompt", lines=1)
             negative_prompt_textbox = gr.Textbox(label="Negative prompt", lines=1)
 
         with gr.Row(equal_height=False):
@@ -231,13 +222,7 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
                 generate_button = gr.Button(value="Generate", variant='primary')
 
         with gr.Accordion("Advanced options", open=False):
-
-            """
-            - Input image can be specified using the "Input Image URL" text box or uploaded by clicking or dragging the image to the "Input Image" box.
-            - Input image will be resized and/or center cropped to a given resolution (320 x 512) automatically.
-            - After setting the input image path, press the "Preview" button to visualize the resized input image.
-            """
-            )
+
             with gr.Column():
                 with gr.Row():
                     input_image_path = gr.Textbox(label="Input Image URL", lines=1, scale=10, info="Press Enter or the Preview button to confirm the input image.")
@@ -248,9 +233,6 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
 
                 with gr.Row():
                     seed_textbox = gr.Slider(label="Seed", value=100, minimum=1, maximum=int(1e8), step=1, interactive=True)
-                    # seed_textbox = gr.Textbox(label="Seed", value=100)
-                    # seed_button = gr.Button(value="\U0001F3B2", elem_classes="toolbutton")
-                    # seed_button.click(fn=lambda: gr.Textbox(value=random.randint(1, int(1e8))), inputs=[], outputs=[seed_textbox])
 
                 with gr.Row():
                     height = gr.Slider(label="Height", value=320, minimum=0, maximum=512, step=16, interactive=False)
@@ -268,28 +250,6 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
     preview_button.click(fn=update_and_resize_image, inputs=[input_image_path, height, width], outputs=[input_image])
     input_image_path.submit(fn=update_and_resize_image, inputs=[input_image_path, height, width], outputs=[input_image])
 
-    EXAMPLES = [
-        ["./example/red_panda_eating_bamboo/0.jpg", "red panda eating bamboo" , "low quality", 50, 320, 512, 7.5, True, 0.23, 975, 10, 100],
-        ["./example/fireworks/0.jpg", "fireworks" , "low quality", 50, 320, 512, 7.5, True, 0.23, 975, 10, 100],
-        ["./example/flowers_swaying/0.jpg", "flowers swaying" , "", 50, 320, 512, 7.5, True, 0.23, 975, 10, 100],
-        ["./example/girl_walking_on_the_beach/0.jpg", "girl walking on the beach" , "low quality, background changing", 50, 320, 512, 7.5, True, 0.25, 995, 10, 49494220],
-        ["./example/house_rotating/0.jpg", "house rotating" , "low quality", 50, 320, 512, 7.5, True, 0.23, 985, 10, 46640174],
-        ["./example/people_runing/0.jpg", "people runing" , "low quality, background changing", 50, 320, 512, 7.5, True, 0.23, 975, 10, 100],
-        ["./example/shark_swimming/0.jpg", "shark swimming" , "", 50, 320, 512, 7.5, True, 0.23, 975, 10, 32947978],
-        ["./example/car_moving/0.jpg", "car moving" , "", 50, 320, 512, 7.5, True, 0.23, 975, 10, 75469653],
-        ["./example/windmill_turning/0.jpg", "windmill turning" , "background changing", 50, 320, 512, 7.5, True, 0.21, 975, 10, 89378613],
-    ]
-
-
-    examples = gr.Examples(
-        examples = EXAMPLES,
-        fn = gen_video,
-        inputs=[input_image, prompt_textbox, negative_prompt_textbox, sample_step_slider, height, width, txt_cfg_scale, use_dctinit, dct_coefficients, noise_level, motion_bucket_id, seed_textbox],
-        outputs=[result_video],
-        cache_examples=True,
-        # cache_examples="lazy",
-    )
-
     generate_button.click(
         fn=gen_video,
         inputs=[
@@ -309,4 +269,4 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
             outputs=[result_video]
     )
 
-demo.launch(debug=False, share=True)
+demo.launch(debug=False, share=True)
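For reference, a minimal standalone sketch of the Korean-to-English prompt-translation step this commit wires into gen_video. It reuses the pipeline call and translate_prompt helper exactly as added in the diff; it assumes the transformers package (and the sentencepiece dependency Marian models need) is installed and that the Helsinki-NLP/opus-mt-ko-en weights can be downloaded. The sample Korean prompt below is illustrative only.

# Standalone sketch of the prompt-translation step added in this commit.
from transformers import pipeline

# Translation pipeline, same model as in the diff above.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")

def translate_prompt(korean_prompt):
    # Same helper as in the diff: the English output is what gen_video
    # passes to videogen_pipeline as the generation prompt.
    translation = translator(korean_prompt, max_length=512)
    return translation[0]['translation_text']

if __name__ == "__main__":
    # Hypothetical Korean input ("a girl walking on the beach"), for illustration only.
    print(translate_prompt("해변을 걷는 소녀"))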