import gradio as gr import argparse import os from musepose_inference import MusePoseInference from pose_align import PoseAlignmentInference from downloading_weights import download_models class App: def __init__(self, args): self.args = args self.pose_alignment_infer = PoseAlignmentInference( model_dir=args.model_dir, output_dir=args.output_dir ) self.musepose_infer = MusePoseInference( model_dir=args.model_dir, output_dir=args.output_dir ) if not args.disable_model_download_at_start: download_models(model_dir=args.model_dir) @staticmethod def on_step1_complete(input_img: str, input_pose_vid: str): return [gr.Image(label="Input Image", value=input_img, type="filepath", scale=5), gr.Video(label="Input Aligned Pose Video", value=input_pose_vid, scale=5)] def musepose_demo(self): with gr.Blocks() as demo: md_header = self.header() with gr.Tabs(): with gr.TabItem('1: Pose Alignment'): with gr.Row(): with gr.Column(scale=3): img_pose_input = gr.Image(label="Input Image", type="filepath", scale=5) vid_dance_input = gr.Video(label="Input Dance Video", max_length=4, scale=5) with gr.Column(scale=3): vid_dance_output = gr.Video(label="Aligned Pose Output", scale=5, interactive=False) vid_dance_output_demo = gr.Video(label="Aligned Pose Output Demo", scale=5) with gr.Column(scale=3): with gr.Column(): nb_detect_resolution = gr.Number(label="Detect Resolution", value=512, precision=0) nb_image_resolution = gr.Number(label="Image Resolution.", value=720, precision=0) nb_align_frame = gr.Number(label="Align Frame", value=0, precision=0) nb_max_frame = gr.Number(label="Max Frame", value=300, precision=0) with gr.Row(): btn_align_pose = gr.Button("ALIGN POSE", variant="primary") with gr.Column(): examples = [ [os.path.join("examples", "dance.mp4"), os.path.join("examples", "ref.png"), 512, 720, 0, 300]] ex_step1 = gr.Examples(examples=examples, inputs=[vid_dance_input, img_pose_input, nb_detect_resolution, nb_image_resolution, nb_align_frame, nb_max_frame], outputs=[vid_dance_output, vid_dance_output_demo], fn=self.pose_alignment_infer.align_pose, cache_examples="lazy") btn_align_pose.click(fn=self.pose_alignment_infer.align_pose, inputs=[vid_dance_input, img_pose_input, nb_detect_resolution, nb_image_resolution, nb_align_frame, nb_max_frame], outputs=[vid_dance_output, vid_dance_output_demo]) with gr.TabItem('2: MusePose Inference'): with gr.Row(): with gr.Column(scale=3): img_musepose_input = gr.Image(label="Input Image", type="filepath", scale=5) vid_pose_input = gr.Video(label="Input Aligned Pose Video", max_length=4, scale=5) with gr.Column(scale=3): vid_output = gr.Video(label="MusePose Output", scale=5) vid_output_demo = gr.Video(label="MusePose Output Demo", scale=5) with gr.Column(scale=3): with gr.Column(): weight_dtype = gr.Dropdown(label="Compute Type", choices=["fp16", "fp32"], value="fp16") nb_width = gr.Number(label="Width.", value=512, precision=0) nb_height = gr.Number(label="Height.", value=512, precision=0) nb_video_frame_length = gr.Number(label="Video Frame Length", value=300, precision=0) nb_video_slice_frame_length = gr.Number(label="Video Slice Frame Number ", value=48, precision=0) nb_video_slice_overlap_frame_number = gr.Number( label="Video Slice Overlap Frame Number", value=4, precision=0) nb_cfg = gr.Number(label="CFG (Classifier Free Guidance)", value=3.5, precision=0) nb_seed = gr.Number(label="Seed", value=99, precision=0) nb_steps = gr.Number(label="DDIM Sampling Steps", value=20, precision=0) nb_fps = gr.Number(label="FPS (Frames Per Second) ", value=-1, precision=0, info="Set to '-1' to use same FPS with pose's") nb_skip = gr.Number(label="SKIP (Frame Sample Rate = SKIP+1)", value=1, precision=0) with gr.Row(): btn_generate = gr.Button("GENERATE", variant="primary") btn_generate.click(fn=self.musepose_infer.infer_musepose, inputs=[img_musepose_input, vid_pose_input, weight_dtype, nb_width, nb_height, nb_video_frame_length, nb_video_slice_frame_length, nb_video_slice_overlap_frame_number, nb_cfg, nb_seed, nb_steps, nb_fps, nb_skip], outputs=[vid_output, vid_output_demo]) vid_dance_output.change(fn=self.on_step1_complete, inputs=[img_pose_input, vid_dance_output], outputs=[img_musepose_input, vid_pose_input]) return demo @staticmethod def header(): header = gr.HTML( """
Note: This space only allows video input up to 3 seconds because ZeroGPU limits the function runtime to 2 minutes.
If you want longer video inputs, you have to run it locally. Click the link above and follow the README to try it locally.
When you have completed the 1: Pose Alignment process, go to 2: MusePose Inference and click the "GENERATE" button.