Commit b9d3de1 · DongfuJiang committed · Parent(s): 0afd9ef

update

This view is limited to 50 files because the commit contains too many changes.
- app.py +92 -44
- examples/2006733/2006733_00.jpg +0 -0
- examples/2006733/2006733_01.jpg +0 -0
- examples/2006733/2006733_02.jpg +0 -0
- examples/2006733/2006733_03.jpg +0 -0
- examples/2006733/2006733_04.jpg +0 -0
- examples/2006733/2006733_05.jpg +0 -0
- examples/2006733/2006733_06.jpg +0 -0
- examples/2006733/2006733_07.jpg +0 -0
- examples/2006733/2006733_08.jpg +0 -0
- examples/2006733/2006733_09.jpg +0 -0
- examples/2006733/2006733_10.jpg +0 -0
- examples/2006733/2006733_11.jpg +0 -0
- examples/2006733/2006733_12.jpg +0 -0
- examples/2006733/2006733_13.jpg +0 -0
- examples/2006733/2006733_14.jpg +0 -0
- examples/2006733/2006733_15.jpg +0 -0
- examples/data_subset.json +368 -0
- examples/r000228/r000228_00.jpg +0 -0
- examples/r000228/r000228_01.jpg +0 -0
- examples/r000228/r000228_02.jpg +0 -0
- examples/r000228/r000228_03.jpg +0 -0
- examples/r000228/r000228_04.jpg +0 -0
- examples/r000228/r000228_05.jpg +0 -0
- examples/r000228/r000228_06.jpg +0 -0
- examples/r000228/r000228_07.jpg +0 -0
- examples/r000228/r000228_08.jpg +0 -0
- examples/r000228/r000228_09.jpg +0 -0
- examples/r000228/r000228_10.jpg +0 -0
- examples/r000228/r000228_11.jpg +0 -0
- examples/r000228/r000228_12.jpg +0 -0
- examples/r000228/r000228_13.jpg +0 -0
- examples/r000228/r000228_14.jpg +0 -0
- examples/r000228/r000228_15.jpg +0 -0
- examples/r000228/r000228_16.jpg +0 -0
- examples/r000228/r000228_17.jpg +0 -0
- examples/r000228/r000228_18.jpg +0 -0
- examples/r000228/r000228_19.jpg +0 -0
- examples/r000228/r000228_20.jpg +0 -0
- examples/r000228/r000228_21.jpg +0 -0
- examples/r000228/r000228_22.jpg +0 -0
- examples/r000228/r000228_23.jpg +0 -0
- examples/r000688/r000688_00.jpg +0 -0
- examples/r000688/r000688_01.jpg +0 -0
- examples/r000688/r000688_02.jpg +0 -0
- examples/r000688/r000688_03.jpg +0 -0
- examples/r000688/r000688_04.jpg +0 -0
- examples/r000688/r000688_05.jpg +0 -0
- examples/r000688/r000688_06.jpg +0 -0
- examples/r000688/r000688_07.jpg +0 -0
app.py
CHANGED
@@ -2,33 +2,81 @@ import gradio as gr
 import spaces
 import os
 import time
+import json
 from PIL import Image
 import functools
-from …
+from transformers import AutoProcessor, Idefics2ForConditionalGeneration
 from models.conversation import conv_templates
 from typing import List
-processor = …
-model = …
-conv_template = conv_templates[…
+processor = AutoProcessor.from_pretrained("MFuyu/mantis-8b-idefics2-video-eval_8192_lora")
+model = Idefics2ForConditionalGeneration.from_pretrained("MFuyu/mantis-8b-idefics2-video-eval_8192_lora")
+conv_template = conv_templates["idefics_2"]
 
-…
-…
-    global processor, model
-    model = model.to("cuda")
-    if not images:
-        images = None
-    for text, history in chat_mllava_stream(text, images, model, processor, history=history, **kwargs):
-        yield text
-
-…
+with open("./examples/data_subset.json", 'r') as f:
+    examples = json.load(f)
+
+for item in examples:
+    video_id = item['images'][0].split("_")[0]
+    item['images'] = [os.path.join("./examples", video_id, x) for x in item['images']]
+
+prompt = "Suppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is the beautiful girl, long hair,walk on the sity street, red cloth ,\nall the frames of video are as follows: \n\n"
 @spaces.GPU
 def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
     global processor, model
-    model = model.to("cuda")
+    model = model.to("cuda") if model.device.type != "cuda" else model
     if not images:
         images = None
+
+    user_role = conv_template.roles[0]
+    assistant_role = conv_template.roles[1]
+
+    idefics_2_message = []
+    cur_img_idx = 0
+
+    print(history)
+    for i, message in enumerate(history):
+        if message["role"] == user_role:
+            idefics_2_message.append({
+                "role": user_role,
+                "content": []
+            })
+            message_text = message["text"]
+            num_image_tokens_in_text = message_text.count("<image>")
+            if num_image_tokens_in_text > 0:
+                sub_texts = [x.strip() for x in message_text.split("<image>")]
+                if sub_texts[0]:
+                    idefics_2_message[-1]["content"].append({"type": "text", "text": sub_texts[0]})
+                for sub_text in sub_texts[1:]:
+                    idefics_2_message[-1]["content"].append({"type": "image"})
+                    if sub_text:
+                        idefics_2_message.append({
+                            "role": user_role,
+                            "content": [{"type": "text", "text": sub_text}]
+                        })
+            else:
+                idefics_2_message[-1]["content"].append({"type": "text", "text": message_text})
+        elif message["role"] == assistant_role:
+            if i == len(history) - 1 and not message["text"]:
+                break
+            idefics_2_message.append({
+                "role": assistant_role,
+                "content": [{"type": "text", "text": message["text"]}]
+            })
+    if text:
+        assert idefics_2_message[-1]["role"] == assistant_role and not idefics_2_message[-1]["content"], "Internal error"
+        idefics_2_message.append({
+            "role": user_role,
+            "content": [{"type": "text", "text": text}]
+        })
+
+    print(idefics_2_message)
+    prompt = processor.apply_chat_template(idefics_2_message, add_generation_prompt=True)
+
+    images = [Image.open(x) for x in images]
+    inputs = processor(text=prompt, images=images, return_tensors="pt")
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    outputs = model.generate(**inputs, max_new_tokens=1024)
+    generated_text = processor.decode(outputs[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
     return generated_text
 
 def enable_next_image(uploaded_images, image):
@@ -72,7 +120,6 @@ def get_chat_images(history):
 
 
 def bot(history):
-    print(history)
     cur_messages = {"text": "", "images": []}
     for message in history[::-1]:
         if message[1]:
@@ -87,7 +134,7 @@ def bot(history):
         raise gr.Error("Please enter a message")
     if cur_messages['text'].count("<image>") < len(cur_messages['images']):
        gr.Warning("The number of images uploaded is more than the number of <image> placeholders in the text. Will automatically prepend <image> to the text.")
-        cur_messages['text'] …
+        cur_messages['text'] += "<image> "* (len(cur_messages['images']) - cur_messages['text'].count("<image>"))
     history[-1][0] = cur_messages["text"]
     if cur_messages['text'].count("<image>") > len(cur_messages['images']):
         gr.Warning("The number of images uploaded is less than the number of <image> placeholders in the text. Will automatically remove extra <image> placeholders from the text.")
@@ -105,13 +152,24 @@ def bot(history):
         "do_sample": False
     }
 
-    response = …
-    …
-    …
-    …
-    …
-    …
+    response = generate(None, chat_images, chat_history, **generation_kwargs)
+    return response
+    # for _output in response:
+    #     history[-1][1] = _output
+    #     time.sleep(0.05)
+    #     yield history
 
+def get_images(video_folder:str):
+    """
+    video folder contains images files like {video_folder_name}_00.jpg, {video_folder_name}_01.jpg, ...
+    """
+    images = []
+    for file in os.listdir(video_folder):
+        if file.endswith(".jpg"):
+            images.append(Image.open(os.path.join(video_folder, file)))
+    # sort images by name
+    images.sort(key=lambda x: int(x.filename.split("_")[-1].split(".")[0]))
+    return images
 
 def build_demo():
     with gr.Blocks() as demo:
@@ -166,31 +224,21 @@ Mantis is a multimodal conversational AI model that can chat with users about im
         ).then(
             bot, chatbot, chatbot, api_name="bot_response"
         )
+        dummy_id = gr.Textbox("dummy_id", label="dummy_id", visible=False)
+        dummy_output = gr.Textbox("dummy_output", label="dummy_output", visible=False)
 
         gr.Examples(
             examples=[
-                …
-                {
-                    "text": "<image> <image> Which image shows an older dog?",
-                    "files": ["./examples/image8.jpg", "./examples/image9.jpg"]
-                },
-                {
-                    "text": "Write a description for the given image sequence in a single paragraph, what is happening in this episode?",
-                    "files": ["./examples/image3.jpg", "./examples/image4.jpg", "./examples/image5.jpg", "./examples/image6.jpg", "./examples/image7.jpg"]
-                },
-                {
-                    "text": "<image> <image> How many dices are there in image 1 and image 2 respectively?",
-                    "files": ["./examples/image10.jpg", "./examples/image15.jpg"]
-                },
+                [
+                    item['id'],
+                    {
+                        "text": item['conversations'][0]['value'],
+                        "files": item['images']
+                    },
+                    item['conversations'][1]['value']
+                ] for item in examples
             ],
-            inputs=[chat_input],
+            inputs=[dummy_id, chat_input, dummy_output],
         )
 
         gr.Markdown("""
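Note: the new inference path can be exercised outside the Gradio wiring. The sketch below is not part of the commit; it mirrors the message-building and decoding steps that this diff adds to generate(), and it assumes the checkpoint is downloadable, a CUDA device is available, the "user" role string matches conv_template.roles[0], and two frame JPEGs from the new examples/ folders exist locally.

# Minimal sketch of the inference path added in this commit (assumptions
# noted above; not the app's code).
from PIL import Image
from transformers import AutoProcessor, Idefics2ForConditionalGeneration

processor = AutoProcessor.from_pretrained("MFuyu/mantis-8b-idefics2-video-eval_8192_lora")
model = Idefics2ForConditionalGeneration.from_pretrained(
    "MFuyu/mantis-8b-idefics2-video-eval_8192_lora"
).to("cuda")

# One user turn: each "<image>" placeholder in the chat text becomes an
# {"type": "image"} entry interleaved with the text chunks, as generate()
# builds idefics_2_message from the history.
messages = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "Rate the visual quality of these two frames."},
        {"type": "image"},
        {"type": "image"},
    ],
}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
images = [Image.open("./examples/r000228/r000228_00.jpg"),
          Image.open("./examples/r000228/r000228_01.jpg")]

inputs = processor(text=prompt, images=images, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}
outputs = model.generate(**inputs, max_new_tokens=1024)
# Decode only the newly generated tokens, as generate() does.
print(processor.decode(outputs[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True))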
examples/2006733/2006733_00.jpg
ADDED
examples/2006733/2006733_01.jpg
ADDED
examples/2006733/2006733_02.jpg
ADDED
examples/2006733/2006733_03.jpg
ADDED
examples/2006733/2006733_04.jpg
ADDED
examples/2006733/2006733_05.jpg
ADDED
examples/2006733/2006733_06.jpg
ADDED
examples/2006733/2006733_07.jpg
ADDED
examples/2006733/2006733_08.jpg
ADDED
examples/2006733/2006733_09.jpg
ADDED
examples/2006733/2006733_10.jpg
ADDED
examples/2006733/2006733_11.jpg
ADDED
examples/2006733/2006733_12.jpg
ADDED
examples/2006733/2006733_13.jpg
ADDED
examples/2006733/2006733_14.jpg
ADDED
examples/2006733/2006733_15.jpg
ADDED
examples/data_subset.json
ADDED
@@ -0,0 +1,368 @@
+[
+    {
+        "id": "r003679",
+        "images": [
+            "r003679_00.jpg",
+            "r003679_01.jpg",
+            "r003679_02.jpg",
+            "r003679_03.jpg",
+            "r003679_04.jpg",
+            "r003679_05.jpg",
+            "r003679_06.jpg",
+            "r003679_07.jpg",
+            "r003679_08.jpg",
+            "r003679_09.jpg",
+            "r003679_10.jpg",
+            "r003679_11.jpg",
+            "r003679_12.jpg",
+            "r003679_13.jpg",
+            "r003679_14.jpg",
+            "r003679_15.jpg",
+            "r003679_16.jpg",
+            "r003679_17.jpg",
+            "r003679_18.jpg",
+            "r003679_19.jpg",
+            "r003679_20.jpg",
+            "r003679_21.jpg",
+            "r003679_22.jpg",
+            "r003679_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is dog rolls over and falls off couch,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r100916",
+        "images": [
+            "r100916_00.jpg",
+            "r100916_01.jpg",
+            "r100916_02.jpg",
+            "r100916_03.jpg",
+            "r100916_04.jpg",
+            "r100916_05.jpg",
+            "r100916_06.jpg",
+            "r100916_07.jpg",
+            "r100916_08.jpg",
+            "r100916_09.jpg",
+            "r100916_10.jpg",
+            "r100916_11.jpg",
+            "r100916_12.jpg",
+            "r100916_13.jpg",
+            "r100916_14.jpg",
+            "r100916_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A young man is standing in front of a glacier with mountains in the background.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r004061",
+        "images": [
+            "r004061_00.jpg",
+            "r004061_01.jpg",
+            "r004061_02.jpg",
+            "r004061_03.jpg",
+            "r004061_04.jpg",
+            "r004061_05.jpg",
+            "r004061_06.jpg",
+            "r004061_07.jpg",
+            "r004061_08.jpg",
+            "r004061_09.jpg",
+            "r004061_10.jpg",
+            "r004061_11.jpg",
+            "r004061_12.jpg",
+            "r004061_13.jpg",
+            "r004061_14.jpg",
+            "r004061_15.jpg",
+            "r004061_16.jpg",
+            "r004061_17.jpg",
+            "r004061_18.jpg",
+            "r004061_19.jpg",
+            "r004061_20.jpg",
+            "r004061_21.jpg",
+            "r004061_22.jpg",
+            "r004061_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a man wearing a striped blanket over his backpack runs behind two other men.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r000228",
+        "images": [
+            "r000228_00.jpg",
+            "r000228_01.jpg",
+            "r000228_02.jpg",
+            "r000228_03.jpg",
+            "r000228_04.jpg",
+            "r000228_05.jpg",
+            "r000228_06.jpg",
+            "r000228_07.jpg",
+            "r000228_08.jpg",
+            "r000228_09.jpg",
+            "r000228_10.jpg",
+            "r000228_11.jpg",
+            "r000228_12.jpg",
+            "r000228_13.jpg",
+            "r000228_14.jpg",
+            "r000228_15.jpg",
+            "r000228_16.jpg",
+            "r000228_17.jpg",
+            "r000228_18.jpg",
+            "r000228_19.jpg",
+            "r000228_20.jpg",
+            "r000228_21.jpg",
+            "r000228_22.jpg",
+            "r000228_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is two people get into sports car,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r100887",
+        "images": [
+            "r100887_00.jpg",
+            "r100887_01.jpg",
+            "r100887_02.jpg",
+            "r100887_03.jpg",
+            "r100887_04.jpg",
+            "r100887_05.jpg",
+            "r100887_06.jpg",
+            "r100887_07.jpg",
+            "r100887_08.jpg",
+            "r100887_09.jpg",
+            "r100887_10.jpg",
+            "r100887_11.jpg",
+            "r100887_12.jpg",
+            "r100887_13.jpg",
+            "r100887_14.jpg",
+            "r100887_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A person using a spatula to spread something on a pan.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r104340",
+        "images": [
+            "r104340_00.jpg",
+            "r104340_01.jpg",
+            "r104340_02.jpg",
+            "r104340_03.jpg",
+            "r104340_04.jpg",
+            "r104340_05.jpg",
+            "r104340_06.jpg",
+            "r104340_07.jpg",
+            "r104340_08.jpg",
+            "r104340_09.jpg",
+            "r104340_10.jpg",
+            "r104340_11.jpg",
+            "r104340_12.jpg",
+            "r104340_13.jpg",
+            "r104340_14.jpg",
+            "r104340_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A bartender is making an alcoholic drink by pouring a clear liquid into a glass with ice cubes.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r001599",
+        "images": [
+            "r001599_00.jpg",
+            "r001599_01.jpg",
+            "r001599_02.jpg",
+            "r001599_03.jpg",
+            "r001599_04.jpg",
+            "r001599_05.jpg",
+            "r001599_06.jpg",
+            "r001599_07.jpg",
+            "r001599_08.jpg",
+            "r001599_09.jpg",
+            "r001599_10.jpg",
+            "r001599_11.jpg",
+            "r001599_12.jpg",
+            "r001599_13.jpg",
+            "r001599_14.jpg",
+            "r001599_15.jpg",
+            "r001599_16.jpg",
+            "r001599_17.jpg",
+            "r001599_18.jpg",
+            "r001599_19.jpg",
+            "r001599_20.jpg",
+            "r001599_21.jpg",
+            "r001599_22.jpg",
+            "r001599_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a man uses a camera,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r101298",
+        "images": [
+            "r101298_00.jpg",
+            "r101298_01.jpg",
+            "r101298_02.jpg",
+            "r101298_03.jpg",
+            "r101298_04.jpg",
+            "r101298_05.jpg",
+            "r101298_06.jpg",
+            "r101298_07.jpg",
+            "r101298_08.jpg",
+            "r101298_09.jpg",
+            "r101298_10.jpg",
+            "r101298_11.jpg",
+            "r101298_12.jpg",
+            "r101298_13.jpg",
+            "r101298_14.jpg",
+            "r101298_15.jpg",
+            "r101298_16.jpg",
+            "r101298_17.jpg",
+            "r101298_18.jpg",
+            "r101298_19.jpg",
+            "r101298_20.jpg",
+            "r101298_21.jpg",
+            "r101298_22.jpg",
+            "r101298_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A baseball game is being played in a stadium with a large crowd of people watching.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r000688",
+        "images": [
+            "r000688_00.jpg",
+            "r000688_01.jpg",
+            "r000688_02.jpg",
+            "r000688_03.jpg",
+            "r000688_04.jpg",
+            "r000688_05.jpg",
+            "r000688_06.jpg",
+            "r000688_07.jpg",
+            "r000688_08.jpg",
+            "r000688_09.jpg",
+            "r000688_10.jpg",
+            "r000688_11.jpg",
+            "r000688_12.jpg",
+            "r000688_13.jpg",
+            "r000688_14.jpg",
+            "r000688_15.jpg",
+            "r000688_16.jpg",
+            "r000688_17.jpg",
+            "r000688_18.jpg",
+            "r000688_19.jpg",
+            "r000688_20.jpg",
+            "r000688_21.jpg",
+            "r000688_22.jpg",
+            "r000688_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is the child furthest to the right dips her hand into the bowl shared by the other children.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r104350",
+        "images": [
+            "r104350_00.jpg",
+            "r104350_01.jpg",
+            "r104350_02.jpg",
+            "r104350_03.jpg",
+            "r104350_04.jpg",
+            "r104350_05.jpg",
+            "r104350_06.jpg",
+            "r104350_07.jpg",
+            "r104350_08.jpg",
+            "r104350_09.jpg",
+            "r104350_10.jpg",
+            "r104350_11.jpg",
+            "r104350_12.jpg",
+            "r104350_13.jpg",
+            "r104350_14.jpg",
+            "r104350_15.jpg",
+            "r104350_16.jpg",
+            "r104350_17.jpg",
+            "r104350_18.jpg",
+            "r104350_19.jpg",
+            "r104350_20.jpg",
+            "r104350_21.jpg",
+            "r104350_22.jpg",
+            "r104350_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A young man is talking to a young woman in a room with books on the shelves.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    }
+]
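Note: every "gpt" target in this file shares one line format, "dimension: score" with scores in {1, 2, 3}, so model outputs in the same shape are easy to post-process. A minimal parsing sketch, not part of the commit; the regex and the sample response string are assumptions based on the format above.

import re

# A response in the same "dimension: score" shape as the "gpt" values above.
response = ("visual quality: 3\n object consistency: 3\n dynamic degree: 2\n"
            " motion smoothness: 3\n text-to-video alignment: 1\n"
            " factual consistency: 2\n overall score: 2\n")

# Extract each dimension name and its 1-3 score into a dict.
scores = {m.group(1).strip(): int(m.group(2))
          for m in re.finditer(r"([a-z][a-z\- ]*):\s*([123])", response)}
print(scores)  # {'visual quality': 3, 'object consistency': 3, ..., 'overall score': 2}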
examples/r000228/r000228_00.jpg
ADDED
examples/r000228/r000228_01.jpg
ADDED
examples/r000228/r000228_02.jpg
ADDED
examples/r000228/r000228_03.jpg
ADDED
examples/r000228/r000228_04.jpg
ADDED
examples/r000228/r000228_05.jpg
ADDED
examples/r000228/r000228_06.jpg
ADDED
examples/r000228/r000228_07.jpg
ADDED
examples/r000228/r000228_08.jpg
ADDED
examples/r000228/r000228_09.jpg
ADDED
examples/r000228/r000228_10.jpg
ADDED
examples/r000228/r000228_11.jpg
ADDED
examples/r000228/r000228_12.jpg
ADDED
examples/r000228/r000228_13.jpg
ADDED
examples/r000228/r000228_14.jpg
ADDED
examples/r000228/r000228_15.jpg
ADDED
examples/r000228/r000228_16.jpg
ADDED
examples/r000228/r000228_17.jpg
ADDED
examples/r000228/r000228_18.jpg
ADDED
examples/r000228/r000228_19.jpg
ADDED
examples/r000228/r000228_20.jpg
ADDED
examples/r000228/r000228_21.jpg
ADDED
examples/r000228/r000228_22.jpg
ADDED
examples/r000228/r000228_23.jpg
ADDED
examples/r000688/r000688_00.jpg
ADDED
examples/r000688/r000688_01.jpg
ADDED
examples/r000688/r000688_02.jpg
ADDED
examples/r000688/r000688_03.jpg
ADDED
examples/r000688/r000688_04.jpg
ADDED
examples/r000688/r000688_05.jpg
ADDED
examples/r000688/r000688_06.jpg
ADDED
examples/r000688/r000688_07.jpg
ADDED