import os
import gradio as gr
import pandas as pd
BASELINE = '<a target="_blank" href="https://github.com/showlab/loveu-tgve-2023" style="color: blue; text-decoration: underline; text-decoration-style: dotted;">Tune-A-Video (Baseline)</a>'
COLS = ["Method", "CLIPScore (Frame Consistency) ⬆️", "CLIPScore (Text Alignment) ⬆️", "PickScore ⬆️", "Human Preference ⬆️", "References"]
TYPES = ["markdown", "number", "number", "number", "str", "markdown"]
def get_leaderboard():
    """Assemble the leaderboard rows into a DataFrame sorted by PickScore."""
    all_data = []
    baseline_0 = {
        "Method": '**Tune-A-Video**',
        "CLIPScore (Frame Consistency) ⬆️": 0.92,
        "CLIPScore (Text Alignment) ⬆️": 27.12,
        "PickScore ⬆️": 20.36,
        "Human Preference ⬆️": '',
        "References": ','.join(['<a target="_blank" href="https://arxiv.org/abs/2212.11565" style="color: blue">Paper</a>',
                                '<a target="_blank" href="https://github.com/showlab/Tune-A-Video" style="color: blue">Code</a>',
                                '<a target="_blank" href="https://tuneavideo.github.io/" style="color: blue">Website</a>',
                                '<a target="_blank" href="https://huggingface.co/spaces/Tune-A-Video-library/Tune-A-Video-inference" style="color: blue">Demo</a>'])
    }
    # Placeholder row; the metric columns are left unset (NaN) until results exist.
    baseline_1 = {
        "Method": 'VideoCrafter (todo)',
        "References": ','.join(['<a target="_blank" href="https://github.com/VideoCrafter/VideoCrafter" style="color: blue">Code</a>',
                                '<a target="_blank" href="https://huggingface.co/spaces/VideoCrafter/VideoCrafter" style="color: blue">Demo</a>'])
    }
    all_data += [baseline_0, baseline_1]
    dataframe = pd.DataFrame.from_records(all_data)
    dataframe = dataframe.sort_values(by=['PickScore ⬆️'], ascending=False)
    print(dataframe)
    dataframe = dataframe[COLS]
    return dataframe
leaderboard = get_leaderboard()
def refresh():
    return get_leaderboard()

def load_edited_video(source_video, *args):
    # Map a source clip to its pre-rendered edit, e.g. "cat.mp4" -> "files/cat-edit.mp4".
    result = source_video.split('/')[-1].split('.mp4')[0] + '-edit.mp4'
    return os.path.join(os.path.dirname(__file__), f"files/{result}")
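
# ---------------------------------------------------------------------------
# Illustrative metric sketches (not part of the original app; never called).
# The Leaderboard tab below describes three automatic metrics; the helpers
# here are a minimal sketch of how the two CLIP-based ones might be computed
# with Hugging Face `transformers`. The CLIP variant and preprocessing used
# by the competition are not specified in this file, so the checkpoint name
# "openai/clip-vit-base-patch32" is only a placeholder assumption. PickScore
# would follow the same image-text pattern with its own CLIP-style
# checkpoint; it is omitted here.

def clip_frame_consistency(frames):
    """Average cosine similarity between CLIP embeddings of all frame pairs.

    `frames` is assumed to be a list of PIL images decoded from an output
    video (at least two frames).
    """
    import torch  # imported lazily so the app itself does not require torch
    from transformers import CLIPModel, CLIPProcessor

    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    inputs = processor(images=frames, return_tensors="pt")
    with torch.no_grad():
        emb = model.get_image_features(**inputs)
    emb = emb / emb.norm(dim=-1, keepdim=True)  # unit-normalise embeddings
    sim = emb @ emb.T                           # pairwise cosine similarities
    n = sim.shape[0]
    # Mean over distinct pairs: subtract the diagonal (n self-similarities of 1.0).
    return ((sim.sum() - n) / (n * (n - 1))).item()


def clip_text_alignment(frames, edited_prompt):
    """Average CLIP similarity between each output frame and the edited
    prompt, reported on the conventional 100x CLIPScore scale."""
    import torch
    from transformers import CLIPModel, CLIPProcessor

    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    inputs = processor(text=[edited_prompt], images=frames,
                       return_tensors="pt", padding=True)
    with torch.no_grad():
        img = model.get_image_features(pixel_values=inputs["pixel_values"])
        txt = model.get_text_features(input_ids=inputs["input_ids"],
                                      attention_mask=inputs["attention_mask"])
    img = img / img.norm(dim=-1, keepdim=True)
    txt = txt / txt.norm(dim=-1, keepdim=True)
    return (100 * (img @ txt.T)).mean().item()
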
block = gr.Blocks()
with block:
    with gr.Tab("Leaderboard"):
        with gr.Row():
            gr.Markdown("""
            # 🤗 LOVEU-TGVE @ CVPR 2023 Leaderboard

            <font size="4">
            <b>Welcome to the <a href="https://sites.google.com/view/loveucvpr23/track4" target="_blank">Text-Guided Video Editing (TGVE)</a> competition leaderboard of the <a href="https://sites.google.com/view/loveucvpr23/home" target="_blank">LOVEU Workshop @ CVPR 2023</a>!</b>

            Leveraging AI for video editing has the potential to unleash creativity for artists of all skill levels, and the rapidly advancing field of Text-Guided Video Editing (TGVE) aims to deliver on that promise. Recent works in this field include <a href="https://tuneavideo.github.io/" target="_blank">Tune-A-Video</a>, <a href="https://research.runwayml.com/gen2" target="_blank">Gen-2</a>, and <a href="https://dreamix-video-editing.github.io/" target="_blank">Dreamix</a>.

            In this competition track, we provide a standard set of videos and prompts. As a researcher, you will develop a model that takes a video and a prompt describing how to edit it, and produces an edited video. For instance, you might be given a video of “a man is surfing inside the barrel of a wave,” and your model will edit the video to “a man is surfing on a wave made of aurora borealis.”

            During the competition, the leaderboard will display evaluation results for the following three automatic metrics:
            - <a href="https://arxiv.org/abs/2103.00020" target="_blank">CLIPScore</a> (Frame Consistency) - the average cosine similarity between all pairs of CLIP image embeddings computed on all frames of an output video.
            - <a href="https://arxiv.org/abs/2103.00020" target="_blank">CLIPScore</a> (Text Alignment) - the average CLIP score between all frames of an output video and the corresponding edited prompt.
            - <a href="https://arxiv.org/abs/2305.01569" target="_blank">PickScore</a> - the average PickScore across all frames of an output video.

            After all submissions are uploaded, we will run a human evaluation in which labelers compare the submitted videos on the following criteria:
            - Text alignment: How well does the generated video match the caption?
            - Structure: How well does the generated video preserve the structure of the original video?
            - Quality: Aesthetically, how good is this video?

            We will choose a winner and a runner-up based on the human-evaluation results.
            </font>

            A **bold** method name indicates that the implementation is **official** (by the authors/developers of the original method).""")
        with gr.Row():
            leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS,
                                                        datatype=TYPES, max_rows=10)
        with gr.Row():
            refresh_button = gr.Button("Refresh")
            refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])
    block.load(refresh, inputs=[], outputs=[leaderboard_table])
    with gr.Tab("Baseline Demo"):
        with gr.Row():
            gr.Markdown(f"""Some examples generated by {BASELINE} are shown below.""")
        with gr.Row():
            with gr.Column():
                source_video = gr.Video(type="file", label='Source Video', format="mp4", interactive=True)
                source_prompt = gr.Textbox(label='Source Prompt',
                                           # info='A good prompt describes each frame and most objects in the video. In particular, it mentions the object or attribute that we want to edit or preserve.',
                                           max_lines=2,
                                           placeholder='Example: "A cat in the grass in the sun."',
                                           # value='A cat in the grass in the sun.'
                                           )
            with gr.Column():
                result = gr.Video(type="file", label='Edited Video', format="mp4", interactive=True)
                editing_prompt = gr.Textbox(label='Editing Prompt',
                                            # info='A reasonable composition of video and prompt may achieve better results (e.g., a "sunflower" video with a "Van Gogh" prompt works better than with a "Monet" prompt).',
                                            max_lines=2,
                                            placeholder='Example: "A dog in the grass in the sun."',
                                            # value='A dog in the grass in the sun.'
                                            )
        with gr.Row():
            from example import examples
            gr.Examples(examples=examples,
                        inputs=[source_video, source_prompt, editing_prompt],
                        outputs=result,
                        fn=load_edited_video,
                        cache_examples=True,
                        )
block.launch()