File size: 3,590 Bytes
8131f9b
 
 
2d9a728
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import os
import subprocess
import sys

# Install flash-attn at startup, before the demo modules are imported.
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE is set for the install (going by its
# name, this skips compiling the CUDA extension -- confirm against flash-attn).
#
# The child environment extends os.environ rather than replacing it: passing
# a one-key dict as `env` drops PATH, HOME and every other inherited variable,
# leaving pip lookup to the shell's built-in defaults. Invoking pip as
# `sys.executable -m pip` with an argument list targets this interpreter and
# avoids going through a shell.
#
# Still best-effort: a failed install does not raise (check=False).
subprocess.run(
    [sys.executable, '-m', 'pip', 'install', 'flash-attn', '--no-build-isolation'],
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': 'TRUE'},
    check=False,
)

import gradio as gr

# prototyping
# from demo_test import Text2Video, Video2Video

from demo.t2v import Text2Video

# Example rows for the Text2Video tab. Each row is [prompt, duration], matching
# the order of the example inputs; every example uses a duration of 16.
t2v_examples = [
    [prompt, 16]
    for prompt in (
        'walk fast clean',
        'run fast clean',
        'standing up',
        'doing the splits',
        'doing backflips',
        'a headstand',
        'karate kick',
        'crunch abs',
        'doing push ups',
    )
]

def do_nothing():
    """No-op placeholder: takes no arguments and returns ``None``."""
    return None

def videocrafter_demo(result_dir='./tmp/'):
    """Build the Gradio Blocks interface for the text-to-video demo.

    The page shows a project header, a few usage notes, and a single
    "Text2Video" tab with a prompt box, a duration slider, a send button,
    an output video player, and clickable examples. Pressing the button or
    submitting the prompt calls ``text2video.get_prompt`` with the prompt
    and duration and shows the result in the video player.

    Args:
        result_dir: Passed unchanged to ``Text2Video``. Presumably the
            directory where generated videos are stored -- confirm against
            ``demo.t2v``.

    Returns:
        The assembled ``gr.Blocks`` app; it is neither queued nor launched
        here.
    """
    text2video = Text2Video(result_dir)

    # Header markup. Each link is closed with </a> and listed once inside the
    # centred <div>; the earlier markup had a stray </span>, unclosed anchors
    # and a second [Models] link that fell outside the div.
    header_html = (
        "<div align='center'> "
        "<h2> GenRL: Multimodal foundation world models for generalist embodied agents </h2> "
        "<a style='font-size:18px;' href='https://github.com/mazpie/genrl'> [Github] </a> "
        "&nbsp; &nbsp; "
        "<a style='font-size:18px;' href='https://huggingface.co/mazpie/genrl_models'> [Models] </a> "
        "</div>"
    )

    with gr.Blocks(analytics_enabled=False) as videocrafter_iface:
        gr.Markdown(header_html)

        gr.Markdown("<b> Notes: </b>")
        gr.Markdown("<b> - Low quality of the videos generated is expected, as the work focuses on visual-language alignment for behavior learning, not on video generation quality.</b>")
        gr.Markdown("<b> - The model is trained on small 64x64 images, and the videos are generated only from a small 512-dimensional embedding. </b>")
        gr.Markdown("<b> - Some prompts require styling instructions, e.g. fast, clean, in order to work well. See some of the examples. </b>")

        # ---- Text2Video tab ----
        with gr.Tab(label="Text2Video"):
            with gr.Column():
                with gr.Row():
                    # Left column: user inputs.
                    with gr.Column():
                        input_text = gr.Text(label='prompt')
                        duration = gr.Slider(minimum=8, maximum=32, elem_id="duration", label="duration", value=16, step=8)
                        send_btn = gr.Button("Send")
                    # Middle column: intentionally empty, acts as a spacer.
                    with gr.Column():
                        pass
                    # Right column: generated video.
                    with gr.Column():
                        output_video_1 = gr.Video(autoplay=True, width=256, height=256)
                with gr.Row():
                    # Examples are not cached, so clicking one runs the model.
                    gr.Examples(
                        examples=t2v_examples,
                        inputs=[input_text, duration],
                        outputs=[output_video_1],
                        fn=text2video.get_prompt,
                        cache_examples=False,
                    )
            # The button click and pressing Enter in the prompt box trigger
            # the same generation call.
            send_btn.click(
                fn=text2video.get_prompt,
                inputs=[input_text, duration],
                outputs=[output_video_1],
            )
            input_text.submit(
                fn=text2video.get_prompt,
                inputs=[input_text, duration],
                outputs=[output_video_1],
            )

    return videocrafter_iface

if __name__ == "__main__":
    # Script entry point: build the demo with './results' as its result
    # directory, enable the request queue, then start the server with
    # Gradio's default launch settings.
    videocrafter_iface = videocrafter_demo(os.path.join('./', 'results'))
    videocrafter_iface.queue()
    videocrafter_iface.launch()