import gradio as gr
import os
import tempfile
import subprocess


def process_video(uploaded_video_path, texts):
    """Run the MASA + GroundingDINO tracking demo on one video.

    Invokes the project's command-line demo script to produce an annotated
    video, then re-encodes it with ffmpeg into a browser-friendly container.

    Args:
        uploaded_video_path: Filesystem path of the uploaded input video.
        texts: Text prompt(s) forwarded to GroundingDINO for open-vocabulary
            detection (passed through to the CLI ``--texts`` flag).

    Returns:
        Path to the re-encoded output video.

    Raises:
        subprocess.CalledProcessError: If the tracking script or ffmpeg
            exits with a non-zero status (``check=True``).
    """
    # Reserve a unique output filename; delete=False so the file outlives
    # the context manager and the subprocess can write into it.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmpfile:
        output_video_path = tmpfile.name

    command = [
        "python", "demo/video_demo_with_text.py", uploaded_video_path,
        "--out", output_video_path,
        "--masa_config", "configs/masa-gdino/masa_gdino_swinb_inference.py",
        "--masa_checkpoint", "saved_models/masa_models/gdino_masa.pth",
        "--texts", texts,
        "--score-thr", "0.2",
        "--unified",
        "--show_fps",
    ]
    subprocess.run(command, check=True)

    # Ensure the video is in a compatible format using ffmpeg.
    converted_output_path = output_video_path.replace('.mp4', '_converted.mp4')
    ffmpeg_command = [
        "ffmpeg",
        "-i", output_video_path,
        "-c:v", "mpeg4",
        "-c:a", "aac",
        "-b:a", "128k",
        "-movflags", "+faststart",
        converted_output_path,
    ]
    try:
        subprocess.run(ffmpeg_command, check=True)
    finally:
        # Fix a temp-file leak: the intermediate output is no longer needed
        # once ffmpeg has consumed it, so remove it even if re-encoding fails.
        if os.path.exists(output_video_path):
            os.remove(output_video_path)

    return converted_output_path


css = """
#img-display-container {
    max-height: 100vh;
}
#img-display-input {
    max-height: 80vh;
}
#img-display-output {
    max-height: 80vh;
}
"""

title = "# MASA Track Everything Demo"
description = """ MASA + GroundingDINO on your video files! 
Please refer to our [paper](https://arxiv.org/abs/2406.04221), [project page](https://matchinganything.github.io/), or [github](https://github.com/siyuanliii/masa/tree/main?tab=readme-ov-file) for more details."""

with gr.Blocks(css=css) as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown("### Video Object Tracking demo")

    # NOTE(review): the original file's indentation was lost, so the exact
    # widget nesting inside gr.Row() is unrecoverable; inputs grouped in the
    # row with the button/output below is the conventional layout — confirm
    # against the upstream repo if pixel-exact layout matters.
    with gr.Row():
        input_video = gr.Video(label="Input Video")
        input_texts = gr.Textbox(label="Input Texts")
    submit = gr.Button("Submit")
    processed_video = gr.Video(label="Processed Video")

    submit.click(process_video,
                 inputs=[input_video, input_texts],
                 outputs=processed_video)

    # Build the sorted list of bundled example videos for the Examples widget.
    example_files = os.listdir('assets/examples_video')
    example_files.sort()
    example_files = [os.path.join('assets/examples_video', filename)
                     for filename in example_files]

    examples = gr.Examples(examples=example_files,
                           inputs=[input_video, input_texts],
                           outputs=processed_video,
                           fn=process_video,
                           cache_examples=True)


if __name__ == '__main__':
    demo.queue().launch()