import os
import gradio as gr
import pandas as pd
# Display name for the baseline method shown in the demo tab.
BASELINE = 'Tune-A-Video (Baseline)'
# Leaderboard column order; must match the keys of the per-method records below.
COLS = ["Method", "CLIPScore (Frame Consistency) ⬆️", "CLIPScore (Text Alignment) ⬆️", "PickScore ⬆️", "Human Preference ⬆️", "References"]
# gradio Dataframe datatypes, one entry per column in COLS.
TYPES = ["markdown", "number", "number", "number", "str", "markdown"]


def get_leaderboard():
    """Build the current leaderboard as a pandas DataFrame.

    Entries are hard-coded baseline records; a method that is missing a
    metric gets NaN in that column.  Rows are sorted by PickScore, best
    first (NaN rows sort last by pandas default), and columns are returned
    in the fixed COLS order.

    Returns:
        pd.DataFrame: one row per method, columns exactly COLS.
    """
    baseline_0 = {
        "Method": '**Tune-A-Video**',
        "CLIPScore (Frame Consistency) ⬆️": 0.92,
        "CLIPScore (Text Alignment) ⬆️": 27.12,
        "PickScore ⬆️": 20.36,
        "Human Preference ⬆️": '',
        "References": ','.join(['Paper', 'Code', 'Website', 'Demo']),
    }
    # Placeholder entry: metrics not yet available, only references.
    baseline_1 = {
        "Method": 'VideoCrafter (todo)',
        "References": ','.join(['Code', 'Demo']),
    }
    dataframe = pd.DataFrame.from_records([baseline_0, baseline_1])
    dataframe = dataframe.sort_values(by=['PickScore ⬆️'], ascending=False)
    # reindex (unlike dataframe[COLS]) tolerates a column that is absent
    # from every record, filling it with NaN instead of raising KeyError.
    return dataframe.reindex(columns=COLS)
# Initial leaderboard snapshot rendered when the page is first built.
leaderboard = get_leaderboard()


def refresh():
    """Re-query the leaderboard so the table shows the latest standings."""
    latest = get_leaderboard()
    return latest
def load_edited_video(source_video, *args):
    """Map a source video path to its pre-rendered edited counterpart.

    The edited clip for ``<name>.mp4`` is expected at ``files/<name>-edit.mp4``
    next to this script.  Extra positional arguments (the example prompts
    passed along by gr.Examples) are accepted and ignored so this function
    can be used directly as the examples' fn.

    Args:
        source_video: Path to the source ``.mp4`` file.
        *args: Ignored (source/editing prompt strings from the examples row).

    Returns:
        Absolute path to the corresponding edited video file.
    """
    # os.path handles both '/' and the OS-specific separator, unlike
    # str.split('/'); splitext is more robust than split('.mp4')[0].
    stem, _ext = os.path.splitext(os.path.basename(source_video))
    return os.path.join(os.path.dirname(__file__), "files", f"{stem}-edit.mp4")
# ---------------------------------------------------------------------------
# Gradio UI: one tab for the competition leaderboard, one for a baseline demo.
# ---------------------------------------------------------------------------
block = gr.Blocks()
with block:
    # Tab 1: competition description plus the refreshable leaderboard table.
    with gr.Tab("Leaderboard"):
        with gr.Row():
            gr.Markdown(f"""
# 🤗 LOVEU-TGVE @ CVPR 2023 Leaderboard
Welcome to the Text-Guided Video Editing (TGVE) competition leaderboard of LOVEU Workshop @ CVPR 2023!
Leveraging AI for video editing has the potential to unleash creativity for artists across all skill levels. The rapidly-advancing field of Text-Guided Video Editing (TGVE) is here to address this challenge. Recent works in this field include Tune-A-Video, Gen-2, and Dreamix.
In this competition track, we provide a standard set of videos and prompts. As a researcher, you will develop a model that takes a video and a prompt for how to edit it, and your model will produce an edited video. For instance, you might be given a video of “a man is surfing inside the barrel of a wave,” and your model will edit the video to “a man is surfing on a wave made of aurora borealis.”
During the competition, evaluation results performed against the following 3 automatic metrics will be displayed on the leaderboard:
- CLIPScore (Frame Consistency) - the average cosine similarity between all pairs of CLIP image embeddings computed on all frames of output videos.
- CLIPScore (Text Alignment) - the average CLIP score between all frames of output videos and corresponding edited prompts.
- PickScore - the average PickScore between all frames of output videos.
After all submissions are uploaded, we will run a human-evaluation of all submitted videos. Specifically, we will have human labelers compare all submitted videos. Labelers will evaluate videos on the following criteria:
- Text alignment: How well does the generated video match the caption?
- Structure: How well does the generated video preserve the structure of the original video?
- Quality: Aesthetically, how good is this video?
We will choose a winner and a runner-up based on the human evaluation results.
The **bold** method name indicates that the implementation is **official** (by the author / developer of the original method).""")
        with gr.Row():
            # Table is seeded with the module-level snapshot; per-column
            # rendering comes from TYPES (markdown links, numeric metrics).
            leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS,
                                                        datatype=TYPES, max_rows=10)
        with gr.Row():
            refresh_button = gr.Button("Refresh")
        # Manual refresh via the button, plus an automatic refresh on page load.
        refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])
        block.load(refresh, inputs=[], outputs=[leaderboard_table])
    # Tab 2: side-by-side source/edited examples produced by the baseline.
    with gr.Tab("Baseline Demo"):
        with gr.Row():
            gr.Markdown(f"""Some examples generated by {BASELINE} are shown below.""")
        with gr.Row():
            # Left column: the original clip and the prompt describing it.
            with gr.Column():
                source_video = gr.Video(type="file", label='Source Video', format="mp4", interactive=True)
                source_prompt = gr.Textbox(label='Source Prompt',
                                           # info='A good prompt describes each frame and most objects in video. Especially, it has the object or attribute that we want to edit or preserve.',
                                           max_lines=2,
                                           placeholder='Example: "A cat in the grass in the sun."',
                                           # value='A cat in the grass in the sun.'
                                           )
            # Right column: the edited result and the prompt that drove the edit.
            with gr.Column():
                result = gr.Video(type="file", label='Edited Video', format="mp4", interactive=True)
                editing_prompt = gr.Textbox(label='Editing Prompt',
                                            # info='A reasonable composition of video may achieve better results(e.g., "sunflower" video with "Van Gogh" prompt is better than "sunflower" with "Monet")',
                                            max_lines=2,
                                            placeholder='Example: "A dog in the grass in the sun."',
                                            # value='A dog in the grass in the sun.'
                                            )
        with gr.Row():
            # Clicking an example row calls load_edited_video to fetch the
            # pre-rendered edited clip; results are cached on first use.
            from example import examples
            gr.Examples(examples=examples,
                        inputs=[source_video, source_prompt, editing_prompt],
                        outputs=result,
                        fn=load_edited_video,
                        cache_examples=True,
                        )
block.launch()