"""Gradio demo that generates a short GIF with AnimateDiff + SparseCtrl (scribble).

Prompts may be written in Korean or English; Korean text is translated to
English before it is handed to the diffusion pipeline.
"""

import ast
from functools import lru_cache

import gradio as gr
import torch
from diffusers import (
    AnimateDiffPipeline,  # kept for backward compatibility; no longer used below
    AnimateDiffSparseControlNetPipeline,
    AutoencoderKL,
    DPMSolverMultistepScheduler,
    MotionAdapter,
    SparseControlNetModel,
)
from diffusers.utils import export_to_gif, load_image
from transformers import pipeline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Korean-to-English translation model.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")

_BASE_MODEL_ID = "SG161222/Realistic_Vision_V6.0_B1_noVAE"
_MOTION_ADAPTER_ID = "guoyww/animatediff-motion-adapter-v1-5-3"
_CONTROLNET_ID = "guoyww/animatediff-sparsectrl-scribble"
_VAE_ID = "stabilityai/sd-vae-ft-mse"
_OUTPUT_PATH = "output.gif"
_SEED = 1337

_CONDITIONING_IMAGE_URLS = (
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-1.png",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-2.png",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-3.png",
)


def translate_korean_to_english(text):
    """Return ``text`` translated to English if it contains Hangul.

    Args:
        text: Prompt text, possibly containing Korean characters.

    Returns:
        The translator's output when at least one character falls in
        U+3131..U+3163 or U+AC00..U+D7A3; otherwise ``text`` unchanged.
    """
    has_korean = any(
        '\u3131' <= char <= '\u3163' or '\uac00' <= char <= '\ud7a3'
        for char in text
    )
    if has_korean:
        return translator(text)[0]['translation_text']
    return text


def _parse_frame_indices(raw):
    """Parse the frame-index textbox value into a list of ints.

    Uses ``ast.literal_eval`` so only Python literals are accepted; the text
    comes straight from the web UI and must never be executed as code.

    Args:
        raw: A string such as ``"[0, 8, 15]"`` or ``"0, 8, 15"``, or an
            already-parsed int / list / tuple.

    Returns:
        A non-empty list of ints.

    Raises:
        ValueError: If ``raw`` is not a literal int or a non-empty
            list/tuple of ints.
    """
    parsed = raw
    if isinstance(raw, str):
        try:
            parsed = ast.literal_eval(raw.strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as err:
            raise ValueError(
                f"Conditioning frame indices must be a list of integers "
                f"such as [0, 8, 15]; got {raw!r}."
            ) from err

    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(parsed, int) and not isinstance(parsed, bool):
        parsed = [parsed]

    if not isinstance(parsed, (list, tuple)) or not parsed:
        raise ValueError(
            f"Conditioning frame indices must be a non-empty list of integers; "
            f"got {raw!r}."
        )
    if any(isinstance(i, bool) or not isinstance(i, int) for i in parsed):
        raise ValueError(
            f"Conditioning frame indices must contain only integers; got {raw!r}."
        )
    return list(parsed)


@lru_cache(maxsize=1)
def _load_pipeline():
    """Build the SparseCtrl AnimateDiff pipeline once and reuse it.

    The weights are loaded lazily on the first request instead of on every
    call, which is what previously dominated per-request latency.
    """
    # Half precision is only used on CUDA; fall back to float32 on CPU.
    dtype = torch.float16 if device.type == "cuda" else torch.float32

    motion_adapter = MotionAdapter.from_pretrained(
        _MOTION_ADAPTER_ID, torch_dtype=dtype
    ).to(device)
    controlnet = SparseControlNetModel.from_pretrained(
        _CONTROLNET_ID, torch_dtype=dtype
    ).to(device)
    vae = AutoencoderKL.from_pretrained(_VAE_ID, torch_dtype=dtype).to(device)

    # The plain AnimateDiffPipeline has no sparse-ControlNet support, so the
    # conditioning frames would not be applied; the SparseControlNet variant
    # is the one that accepts `controlnet`, `conditioning_frames` and
    # `controlnet_frame_indices`.
    pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
        _BASE_MODEL_ID,
        motion_adapter=motion_adapter,
        controlnet=controlnet,
        vae=vae,
        torch_dtype=dtype,
    ).to(device)
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(
        pipe.scheduler.config,
        beta_schedule="linear",
        algorithm_type="dpmsolver++",
        use_karras_sigmas=True,
    )
    return pipe


@lru_cache(maxsize=1)
def _load_conditioning_frames():
    """Download the fixed scribble conditioning images once and reuse them."""
    return tuple(load_image(url) for url in _CONDITIONING_IMAGE_URLS)


def generate_video(prompt, negative_prompt, num_inference_steps, conditioning_frame_indices, controlnet_conditioning_scale):
    """Generate an animated GIF from a text prompt and fixed scribble frames.

    Args:
        prompt: Positive prompt (Korean or English).
        negative_prompt: Negative prompt (Korean or English).
        num_inference_steps: Number of denoising steps.
        conditioning_frame_indices: Text such as ``"[0, 8, 15]"`` giving the
            frame positions the conditioning images apply to.
        controlnet_conditioning_scale: ControlNet conditioning strength.

    Returns:
        Path of the written GIF file (``"output.gif"``).

    Raises:
        gr.Error: If ``conditioning_frame_indices`` cannot be parsed as a
            list of integers.
    """
    # Validate the cheap, user-controlled input before any heavy work.
    try:
        frame_indices = _parse_frame_indices(conditioning_frame_indices)
    except ValueError as err:
        raise gr.Error(str(err)) from err

    prompt = translate_korean_to_english(prompt)
    negative_prompt = translate_korean_to_english(negative_prompt)

    pipe = _load_pipeline()
    conditioning_frames = list(_load_conditioning_frames())

    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=int(num_inference_steps),
        conditioning_frames=conditioning_frames,
        controlnet_conditioning_scale=float(controlnet_conditioning_scale),
        controlnet_frame_indices=frame_indices,
        # A fresh, fixed-seed generator per call keeps results reproducible
        # even though the pipeline object itself is shared between calls.
        generator=torch.Generator().manual_seed(_SEED),
    ).frames[0]

    export_to_gif(video, _OUTPUT_PATH)
    return _OUTPUT_PATH


demo = gr.Interface(
    fn=generate_video,
    inputs=[
        gr.Textbox(label="Prompt (한글 또는 영어)", value="카페에서 커피 마시는 아름다운 프랑스 여성, 걸작, 고품질"),
        gr.Textbox(label="Negative Prompt (한글 또는 영어)", value="저품질, 최악의 품질, 레터박스"),
        gr.Slider(label="Number of Inference Steps", minimum=1, maximum=200, step=1, value=100),
        gr.Textbox(label="Conditioning Frame Indices", value="[0, 8, 15]"),
        gr.Slider(label="ControlNet Conditioning Scale", minimum=0.1, maximum=2.0, step=0.1, value=1.0),
    ],
    outputs=gr.Image(label="Generated Video"),
    title="AnimateDiffSparseControlNetPipeline을 사용한 비디오 생성",
    description="AnimateDiffSparseControlNetPipeline을 사용하여 비디오를 생성합니다. 한글 또는 영어로 프롬프트를 입력할 수 있습니다.",
)

demo.launch()