DongfuJiang committed on
Commit
b9d3de1
·
1 Parent(s): 0afd9ef
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +92 -44
  2. examples/2006733/2006733_00.jpg +0 -0
  3. examples/2006733/2006733_01.jpg +0 -0
  4. examples/2006733/2006733_02.jpg +0 -0
  5. examples/2006733/2006733_03.jpg +0 -0
  6. examples/2006733/2006733_04.jpg +0 -0
  7. examples/2006733/2006733_05.jpg +0 -0
  8. examples/2006733/2006733_06.jpg +0 -0
  9. examples/2006733/2006733_07.jpg +0 -0
  10. examples/2006733/2006733_08.jpg +0 -0
  11. examples/2006733/2006733_09.jpg +0 -0
  12. examples/2006733/2006733_10.jpg +0 -0
  13. examples/2006733/2006733_11.jpg +0 -0
  14. examples/2006733/2006733_12.jpg +0 -0
  15. examples/2006733/2006733_13.jpg +0 -0
  16. examples/2006733/2006733_14.jpg +0 -0
  17. examples/2006733/2006733_15.jpg +0 -0
  18. examples/data_subset.json +368 -0
  19. examples/r000228/r000228_00.jpg +0 -0
  20. examples/r000228/r000228_01.jpg +0 -0
  21. examples/r000228/r000228_02.jpg +0 -0
  22. examples/r000228/r000228_03.jpg +0 -0
  23. examples/r000228/r000228_04.jpg +0 -0
  24. examples/r000228/r000228_05.jpg +0 -0
  25. examples/r000228/r000228_06.jpg +0 -0
  26. examples/r000228/r000228_07.jpg +0 -0
  27. examples/r000228/r000228_08.jpg +0 -0
  28. examples/r000228/r000228_09.jpg +0 -0
  29. examples/r000228/r000228_10.jpg +0 -0
  30. examples/r000228/r000228_11.jpg +0 -0
  31. examples/r000228/r000228_12.jpg +0 -0
  32. examples/r000228/r000228_13.jpg +0 -0
  33. examples/r000228/r000228_14.jpg +0 -0
  34. examples/r000228/r000228_15.jpg +0 -0
  35. examples/r000228/r000228_16.jpg +0 -0
  36. examples/r000228/r000228_17.jpg +0 -0
  37. examples/r000228/r000228_18.jpg +0 -0
  38. examples/r000228/r000228_19.jpg +0 -0
  39. examples/r000228/r000228_20.jpg +0 -0
  40. examples/r000228/r000228_21.jpg +0 -0
  41. examples/r000228/r000228_22.jpg +0 -0
  42. examples/r000228/r000228_23.jpg +0 -0
  43. examples/r000688/r000688_00.jpg +0 -0
  44. examples/r000688/r000688_01.jpg +0 -0
  45. examples/r000688/r000688_02.jpg +0 -0
  46. examples/r000688/r000688_03.jpg +0 -0
  47. examples/r000688/r000688_04.jpg +0 -0
  48. examples/r000688/r000688_05.jpg +0 -0
  49. examples/r000688/r000688_06.jpg +0 -0
  50. examples/r000688/r000688_07.jpg +0 -0
app.py CHANGED
@@ -2,33 +2,81 @@ import gradio as gr
2
  import spaces
3
  import os
4
  import time
 
5
  from PIL import Image
6
  import functools
7
- from models.mllava import MLlavaProcessor, LlavaForConditionalGeneration, chat_mllava_stream, MLlavaForConditionalGeneration, chat_mllava
8
  from models.conversation import conv_templates
9
  from typing import List
10
- processor = MLlavaProcessor.from_pretrained("TIGER-Lab/Mantis-8B-siglip-llama3")
11
- model = LlavaForConditionalGeneration.from_pretrained("TIGER-Lab/Mantis-8B-siglip-llama3")
12
- conv_template = conv_templates['llama_3']
13
 
14
- @spaces.GPU
15
- def generate_stream(text:str, images:List[Image.Image], history: List[dict], **kwargs):
16
- global processor, model
17
- model = model.to("cuda")
18
- if not images:
19
- images = None
20
- for text, history in chat_mllava_stream(text, images, model, processor, history=history, **kwargs):
21
- yield text
22
 
23
- return text
 
 
24
 
 
25
  @spaces.GPU
26
  def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
27
  global processor, model
28
- model = model.to("cuda")
29
  if not images:
30
  images = None
31
- generated_text, history = chat_mllava(text, images, model, processor, history=history, **kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  return generated_text
33
 
34
  def enable_next_image(uploaded_images, image):
@@ -72,7 +120,6 @@ def get_chat_images(history):
72
 
73
 
74
  def bot(history):
75
- print(history)
76
  cur_messages = {"text": "", "images": []}
77
  for message in history[::-1]:
78
  if message[1]:
@@ -87,7 +134,7 @@ def bot(history):
87
  raise gr.Error("Please enter a message")
88
  if cur_messages['text'].count("<image>") < len(cur_messages['images']):
89
  gr.Warning("The number of images uploaded is more than the number of <image> placeholders in the text. Will automatically prepend <image> to the text.")
90
- cur_messages['text'] = "<image> "* (len(cur_messages['images']) - cur_messages['text'].count("<image>")) + cur_messages['text']
91
  history[-1][0] = cur_messages["text"]
92
  if cur_messages['text'].count("<image>") > len(cur_messages['images']):
93
  gr.Warning("The number of images uploaded is less than the number of <image> placeholders in the text. Will automatically remove extra <image> placeholders from the text.")
@@ -105,13 +152,24 @@ def bot(history):
105
  "do_sample": False
106
  }
107
 
108
- response = generate_stream(None, chat_images, chat_history, **generation_kwargs)
109
- for _output in response:
110
- history[-1][1] = _output
111
- time.sleep(0.05)
112
- yield history
113
-
114
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  def build_demo():
117
  with gr.Blocks() as demo:
@@ -166,31 +224,21 @@ Mantis is a multimodal conversational AI model that can chat with users about im
166
  ).then(
167
  bot, chatbot, chatbot, api_name="bot_response"
168
  )
 
 
169
 
170
  gr.Examples(
171
  examples=[
172
- {
173
- "text": "<image> <image> <image> Which image shows a different mood of character from the others?",
174
- "files": ["./examples/image12.jpg", "./examples/image13.jpg", "./examples/image14.jpg"]
175
- },
176
- {
177
- "text": "<image> <image> What's the difference between these two images? Please describe as much as you can.",
178
- "files": ["./examples/image1.jpg", "./examples/image2.jpg"]
179
- },
180
- {
181
- "text": "<image> <image> Which image shows an older dog?",
182
- "files": ["./examples/image8.jpg", "./examples/image9.jpg"]
183
- },
184
- {
185
- "text": "Write a description for the given image sequence in a single paragraph, what is happening in this episode?",
186
- "files": ["./examples/image3.jpg", "./examples/image4.jpg", "./examples/image5.jpg", "./examples/image6.jpg", "./examples/image7.jpg"]
187
- },
188
- {
189
- "text": "<image> <image> How many dices are there in image 1 and image 2 respectively?",
190
- "files": ["./examples/image10.jpg", "./examples/image15.jpg"]
191
- },
192
  ],
193
- inputs=[chat_input],
194
  )
195
 
196
  gr.Markdown("""
 
2
  import spaces
3
  import os
4
  import time
5
+ import json
6
  from PIL import Image
7
  import functools
8
+ from transformers import AutoProcessor, Idefics2ForConditionalGeneration
9
  from models.conversation import conv_templates
10
  from typing import List
11
+ processor = AutoProcessor.from_pretrained("MFuyu/mantis-8b-idefics2-video-eval_8192_lora")
12
+ model = Idefics2ForConditionalGeneration.from_pretrained("MFuyu/mantis-8b-idefics2-video-eval_8192_lora")
13
+ conv_template = conv_templates["idefics_2"]
14
 
15
# Load the bundled example conversations that are shown in the demo's example gallery.
with open("./examples/data_subset.json", 'r', encoding="utf-8") as f:
    examples = json.load(f)

# Frame files are listed by bare name ("<video_id>_<NN>.jpg") but are stored under
# ./examples/<video_id>/, so rewrite every entry to its path on disk.
for item in examples:
    if not item['images']:
        # Nothing to resolve; also avoids an IndexError on an empty frame list.
        continue
    # rsplit drops only the trailing frame index, so ids that themselves contain "_" stay intact.
    video_id = item['images'][0].rsplit("_", 1)[0]
    item['images'] = [os.path.join("./examples", video_id, x) for x in item['images']]
21
 
22
+ prompt = "Suppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is the beautiful girl, long hair,walk on the sity street, red cloth ,\nall the frames of video are as follows: \n\n"
23
  @spaces.GPU
24
  def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
25
  global processor, model
26
+ model = model.to("cuda") if model.device.type != "cuda" else model
27
  if not images:
28
  images = None
29
+
30
+ user_role = conv_template.roles[0]
31
+ assistant_role = conv_template.roles[1]
32
+
33
+ idefics_2_message = []
34
+ cur_img_idx = 0
35
+
36
+ print(history)
37
+ for i, message in enumerate(history):
38
+ if message["role"] == user_role:
39
+ idefics_2_message.append({
40
+ "role": user_role,
41
+ "content": []
42
+ })
43
+ message_text = message["text"]
44
+ num_image_tokens_in_text = message_text.count("<image>")
45
+ if num_image_tokens_in_text > 0:
46
+ sub_texts = [x.strip() for x in message_text.split("<image>")]
47
+ if sub_texts[0]:
48
+ idefics_2_message[-1]["content"].append({"type": "text", "text": sub_texts[0]})
49
+ for sub_text in sub_texts[1:]:
50
+ idefics_2_message[-1]["content"].append({"type": "image"})
51
+ if sub_text:
52
+ idefics_2_message.append({
53
+ "role": user_role,
54
+ "content": [{"type": "text", "text": sub_text}]
55
+ })
56
+ else:
57
+ idefics_2_message[-1]["content"].append({"type": "text", "text": message_text})
58
+ elif message["role"] == assistant_role:
59
+ if i == len(history) - 1 and not message["text"]:
60
+ break
61
+ idefics_2_message.append({
62
+ "role": assistant_role,
63
+ "content": [{"type": "text", "text": message["text"]}]
64
+ })
65
+ if text:
66
+ assert idefics_2_message[-1]["role"] == assistant_role and not idefics_2_message[-1]["content"], "Internal error"
67
+ idefics_2_message.append({
68
+ "role": user_role,
69
+ "content": [{"type": "text", "text": text}]
70
+ })
71
+
72
+ print(idefics_2_message)
73
+ prompt = processor.apply_chat_template(idefics_2_message, add_generation_prompt=True)
74
+
75
+ images = [Image.open(x) for x in images]
76
+ inputs = processor(text=prompt, images=images, return_tensors="pt")
77
+ inputs = {k: v.to(model.device) for k, v in inputs.items()}
78
+ outputs = model.generate(**inputs, max_new_tokens=1024)
79
+ generated_text = processor.decode(outputs[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
80
  return generated_text
81
 
82
  def enable_next_image(uploaded_images, image):
 
120
 
121
 
122
  def bot(history):
 
123
  cur_messages = {"text": "", "images": []}
124
  for message in history[::-1]:
125
  if message[1]:
 
134
  raise gr.Error("Please enter a message")
135
  if cur_messages['text'].count("<image>") < len(cur_messages['images']):
136
  gr.Warning("The number of images uploaded is more than the number of <image> placeholders in the text. Will automatically prepend <image> to the text.")
137
+ cur_messages['text'] += "<image> "* (len(cur_messages['images']) - cur_messages['text'].count("<image>"))
138
  history[-1][0] = cur_messages["text"]
139
  if cur_messages['text'].count("<image>") > len(cur_messages['images']):
140
  gr.Warning("The number of images uploaded is less than the number of <image> placeholders in the text. Will automatically remove extra <image> placeholders from the text.")
 
152
  "do_sample": False
153
  }
154
 
155
+ response = generate(None, chat_images, chat_history, **generation_kwargs)
156
+ return response
157
+ # for _output in response:
158
+ # history[-1][1] = _output
159
+ # time.sleep(0.05)
160
+ # yield history
161
 
162
def get_images(video_folder:str):
    """
    Open every ``.jpg`` file found directly inside ``video_folder``.

    Files are expected to be named {video_folder_name}_00.jpg, {video_folder_name}_01.jpg, ...
    The returned list of opened images is ordered by that trailing frame number.
    """
    def frame_number(frame):
        # "<folder>/<name>_07.jpg" -> 7
        last_chunk = frame.filename.split("_")[-1]
        return int(last_chunk.split(".")[0])

    frames = [
        Image.open(os.path.join(video_folder, entry))
        for entry in os.listdir(video_folder)
        if entry.endswith(".jpg")
    ]
    return sorted(frames, key=frame_number)
173
 
174
  def build_demo():
175
  with gr.Blocks() as demo:
 
224
  ).then(
225
  bot, chatbot, chatbot, api_name="bot_response"
226
  )
227
+ dummy_id = gr.Textbox("dummy_id", label="dummy_id", visible=False)
228
+ dummy_output = gr.Textbox("dummy_output", label="dummy_output", visible=False)
229
 
230
  gr.Examples(
231
  examples=[
232
+ [
233
+ item['id'],
234
+ {
235
+ "text": item['conversations'][0]['value'],
236
+ "files": item['images']
237
+ },
238
+ item['conversations'][1]['value']
239
+ ] for item in examples
 
 
 
 
 
 
 
 
 
 
 
 
240
  ],
241
+ inputs=[dummy_id, chat_input, dummy_output],
242
  )
243
 
244
  gr.Markdown("""
examples/2006733/2006733_00.jpg ADDED
examples/2006733/2006733_01.jpg ADDED
examples/2006733/2006733_02.jpg ADDED
examples/2006733/2006733_03.jpg ADDED
examples/2006733/2006733_04.jpg ADDED
examples/2006733/2006733_05.jpg ADDED
examples/2006733/2006733_06.jpg ADDED
examples/2006733/2006733_07.jpg ADDED
examples/2006733/2006733_08.jpg ADDED
examples/2006733/2006733_09.jpg ADDED
examples/2006733/2006733_10.jpg ADDED
examples/2006733/2006733_11.jpg ADDED
examples/2006733/2006733_12.jpg ADDED
examples/2006733/2006733_13.jpg ADDED
examples/2006733/2006733_14.jpg ADDED
examples/2006733/2006733_15.jpg ADDED
examples/data_subset.json ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "r003679",
4
+ "images": [
5
+ "r003679_00.jpg",
6
+ "r003679_01.jpg",
7
+ "r003679_02.jpg",
8
+ "r003679_03.jpg",
9
+ "r003679_04.jpg",
10
+ "r003679_05.jpg",
11
+ "r003679_06.jpg",
12
+ "r003679_07.jpg",
13
+ "r003679_08.jpg",
14
+ "r003679_09.jpg",
15
+ "r003679_10.jpg",
16
+ "r003679_11.jpg",
17
+ "r003679_12.jpg",
18
+ "r003679_13.jpg",
19
+ "r003679_14.jpg",
20
+ "r003679_15.jpg",
21
+ "r003679_16.jpg",
22
+ "r003679_17.jpg",
23
+ "r003679_18.jpg",
24
+ "r003679_19.jpg",
25
+ "r003679_20.jpg",
26
+ "r003679_21.jpg",
27
+ "r003679_22.jpg",
28
+ "r003679_23.jpg"
29
+ ],
30
+ "conversations": [
31
+ {
32
+ "from": "human",
33
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is dog rolls over and falls off couch,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
34
+ },
35
+ {
36
+ "from": "gpt",
37
+ "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
38
+ }
39
+ ]
40
+ },
41
+ {
42
+ "id": "r100916",
43
+ "images": [
44
+ "r100916_00.jpg",
45
+ "r100916_01.jpg",
46
+ "r100916_02.jpg",
47
+ "r100916_03.jpg",
48
+ "r100916_04.jpg",
49
+ "r100916_05.jpg",
50
+ "r100916_06.jpg",
51
+ "r100916_07.jpg",
52
+ "r100916_08.jpg",
53
+ "r100916_09.jpg",
54
+ "r100916_10.jpg",
55
+ "r100916_11.jpg",
56
+ "r100916_12.jpg",
57
+ "r100916_13.jpg",
58
+ "r100916_14.jpg",
59
+ "r100916_15.jpg"
60
+ ],
61
+ "conversations": [
62
+ {
63
+ "from": "human",
64
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A young man is standing in front of a glacier with mountains in the background.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
65
+ },
66
+ {
67
+ "from": "gpt",
68
+ "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
69
+ }
70
+ ]
71
+ },
72
+ {
73
+ "id": "r004061",
74
+ "images": [
75
+ "r004061_00.jpg",
76
+ "r004061_01.jpg",
77
+ "r004061_02.jpg",
78
+ "r004061_03.jpg",
79
+ "r004061_04.jpg",
80
+ "r004061_05.jpg",
81
+ "r004061_06.jpg",
82
+ "r004061_07.jpg",
83
+ "r004061_08.jpg",
84
+ "r004061_09.jpg",
85
+ "r004061_10.jpg",
86
+ "r004061_11.jpg",
87
+ "r004061_12.jpg",
88
+ "r004061_13.jpg",
89
+ "r004061_14.jpg",
90
+ "r004061_15.jpg",
91
+ "r004061_16.jpg",
92
+ "r004061_17.jpg",
93
+ "r004061_18.jpg",
94
+ "r004061_19.jpg",
95
+ "r004061_20.jpg",
96
+ "r004061_21.jpg",
97
+ "r004061_22.jpg",
98
+ "r004061_23.jpg"
99
+ ],
100
+ "conversations": [
101
+ {
102
+ "from": "human",
103
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a man wearing a striped blanket over his backpack runs behind two other men.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
104
+ },
105
+ {
106
+ "from": "gpt",
107
+ "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
108
+ }
109
+ ]
110
+ },
111
+ {
112
+ "id": "r000228",
113
+ "images": [
114
+ "r000228_00.jpg",
115
+ "r000228_01.jpg",
116
+ "r000228_02.jpg",
117
+ "r000228_03.jpg",
118
+ "r000228_04.jpg",
119
+ "r000228_05.jpg",
120
+ "r000228_06.jpg",
121
+ "r000228_07.jpg",
122
+ "r000228_08.jpg",
123
+ "r000228_09.jpg",
124
+ "r000228_10.jpg",
125
+ "r000228_11.jpg",
126
+ "r000228_12.jpg",
127
+ "r000228_13.jpg",
128
+ "r000228_14.jpg",
129
+ "r000228_15.jpg",
130
+ "r000228_16.jpg",
131
+ "r000228_17.jpg",
132
+ "r000228_18.jpg",
133
+ "r000228_19.jpg",
134
+ "r000228_20.jpg",
135
+ "r000228_21.jpg",
136
+ "r000228_22.jpg",
137
+ "r000228_23.jpg"
138
+ ],
139
+ "conversations": [
140
+ {
141
+ "from": "human",
142
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is two people get into sports car,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
143
+ },
144
+ {
145
+ "from": "gpt",
146
+ "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
147
+ }
148
+ ]
149
+ },
150
+ {
151
+ "id": "r100887",
152
+ "images": [
153
+ "r100887_00.jpg",
154
+ "r100887_01.jpg",
155
+ "r100887_02.jpg",
156
+ "r100887_03.jpg",
157
+ "r100887_04.jpg",
158
+ "r100887_05.jpg",
159
+ "r100887_06.jpg",
160
+ "r100887_07.jpg",
161
+ "r100887_08.jpg",
162
+ "r100887_09.jpg",
163
+ "r100887_10.jpg",
164
+ "r100887_11.jpg",
165
+ "r100887_12.jpg",
166
+ "r100887_13.jpg",
167
+ "r100887_14.jpg",
168
+ "r100887_15.jpg"
169
+ ],
170
+ "conversations": [
171
+ {
172
+ "from": "human",
173
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A person using a spatula to spread something on a pan.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
174
+ },
175
+ {
176
+ "from": "gpt",
177
+ "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
178
+ }
179
+ ]
180
+ },
181
+ {
182
+ "id": "r104340",
183
+ "images": [
184
+ "r104340_00.jpg",
185
+ "r104340_01.jpg",
186
+ "r104340_02.jpg",
187
+ "r104340_03.jpg",
188
+ "r104340_04.jpg",
189
+ "r104340_05.jpg",
190
+ "r104340_06.jpg",
191
+ "r104340_07.jpg",
192
+ "r104340_08.jpg",
193
+ "r104340_09.jpg",
194
+ "r104340_10.jpg",
195
+ "r104340_11.jpg",
196
+ "r104340_12.jpg",
197
+ "r104340_13.jpg",
198
+ "r104340_14.jpg",
199
+ "r104340_15.jpg"
200
+ ],
201
+ "conversations": [
202
+ {
203
+ "from": "human",
204
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A bartender is making an alcoholic drink by pouring a clear liquid into a glass with ice cubes.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
205
+ },
206
+ {
207
+ "from": "gpt",
208
+ "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
209
+ }
210
+ ]
211
+ },
212
+ {
213
+ "id": "r001599",
214
+ "images": [
215
+ "r001599_00.jpg",
216
+ "r001599_01.jpg",
217
+ "r001599_02.jpg",
218
+ "r001599_03.jpg",
219
+ "r001599_04.jpg",
220
+ "r001599_05.jpg",
221
+ "r001599_06.jpg",
222
+ "r001599_07.jpg",
223
+ "r001599_08.jpg",
224
+ "r001599_09.jpg",
225
+ "r001599_10.jpg",
226
+ "r001599_11.jpg",
227
+ "r001599_12.jpg",
228
+ "r001599_13.jpg",
229
+ "r001599_14.jpg",
230
+ "r001599_15.jpg",
231
+ "r001599_16.jpg",
232
+ "r001599_17.jpg",
233
+ "r001599_18.jpg",
234
+ "r001599_19.jpg",
235
+ "r001599_20.jpg",
236
+ "r001599_21.jpg",
237
+ "r001599_22.jpg",
238
+ "r001599_23.jpg"
239
+ ],
240
+ "conversations": [
241
+ {
242
+ "from": "human",
243
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a man uses a camera,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
244
+ },
245
+ {
246
+ "from": "gpt",
247
+ "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
248
+ }
249
+ ]
250
+ },
251
+ {
252
+ "id": "r101298",
253
+ "images": [
254
+ "r101298_00.jpg",
255
+ "r101298_01.jpg",
256
+ "r101298_02.jpg",
257
+ "r101298_03.jpg",
258
+ "r101298_04.jpg",
259
+ "r101298_05.jpg",
260
+ "r101298_06.jpg",
261
+ "r101298_07.jpg",
262
+ "r101298_08.jpg",
263
+ "r101298_09.jpg",
264
+ "r101298_10.jpg",
265
+ "r101298_11.jpg",
266
+ "r101298_12.jpg",
267
+ "r101298_13.jpg",
268
+ "r101298_14.jpg",
269
+ "r101298_15.jpg",
270
+ "r101298_16.jpg",
271
+ "r101298_17.jpg",
272
+ "r101298_18.jpg",
273
+ "r101298_19.jpg",
274
+ "r101298_20.jpg",
275
+ "r101298_21.jpg",
276
+ "r101298_22.jpg",
277
+ "r101298_23.jpg"
278
+ ],
279
+ "conversations": [
280
+ {
281
+ "from": "human",
282
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A baseball game is being played in a stadium with a large crowd of people watching.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
283
+ },
284
+ {
285
+ "from": "gpt",
286
+ "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
287
+ }
288
+ ]
289
+ },
290
+ {
291
+ "id": "r000688",
292
+ "images": [
293
+ "r000688_00.jpg",
294
+ "r000688_01.jpg",
295
+ "r000688_02.jpg",
296
+ "r000688_03.jpg",
297
+ "r000688_04.jpg",
298
+ "r000688_05.jpg",
299
+ "r000688_06.jpg",
300
+ "r000688_07.jpg",
301
+ "r000688_08.jpg",
302
+ "r000688_09.jpg",
303
+ "r000688_10.jpg",
304
+ "r000688_11.jpg",
305
+ "r000688_12.jpg",
306
+ "r000688_13.jpg",
307
+ "r000688_14.jpg",
308
+ "r000688_15.jpg",
309
+ "r000688_16.jpg",
310
+ "r000688_17.jpg",
311
+ "r000688_18.jpg",
312
+ "r000688_19.jpg",
313
+ "r000688_20.jpg",
314
+ "r000688_21.jpg",
315
+ "r000688_22.jpg",
316
+ "r000688_23.jpg"
317
+ ],
318
+ "conversations": [
319
+ {
320
+ "from": "human",
321
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is the child furthest to the right dips her hand into the bowl shared by the other children.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
322
+ },
323
+ {
324
+ "from": "gpt",
325
+ "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
326
+ }
327
+ ]
328
+ },
329
+ {
330
+ "id": "r104350",
331
+ "images": [
332
+ "r104350_00.jpg",
333
+ "r104350_01.jpg",
334
+ "r104350_02.jpg",
335
+ "r104350_03.jpg",
336
+ "r104350_04.jpg",
337
+ "r104350_05.jpg",
338
+ "r104350_06.jpg",
339
+ "r104350_07.jpg",
340
+ "r104350_08.jpg",
341
+ "r104350_09.jpg",
342
+ "r104350_10.jpg",
343
+ "r104350_11.jpg",
344
+ "r104350_12.jpg",
345
+ "r104350_13.jpg",
346
+ "r104350_14.jpg",
347
+ "r104350_15.jpg",
348
+ "r104350_16.jpg",
349
+ "r104350_17.jpg",
350
+ "r104350_18.jpg",
351
+ "r104350_19.jpg",
352
+ "r104350_20.jpg",
353
+ "r104350_21.jpg",
354
+ "r104350_22.jpg",
355
+ "r104350_23.jpg"
356
+ ],
357
+ "conversations": [
358
+ {
359
+ "from": "human",
360
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A young man is talking to a young woman in a room with books on the shelves.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
361
+ },
362
+ {
363
+ "from": "gpt",
364
+ "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
365
+ }
366
+ ]
367
+ }
368
+ ]
examples/r000228/r000228_00.jpg ADDED
examples/r000228/r000228_01.jpg ADDED
examples/r000228/r000228_02.jpg ADDED
examples/r000228/r000228_03.jpg ADDED
examples/r000228/r000228_04.jpg ADDED
examples/r000228/r000228_05.jpg ADDED
examples/r000228/r000228_06.jpg ADDED
examples/r000228/r000228_07.jpg ADDED
examples/r000228/r000228_08.jpg ADDED
examples/r000228/r000228_09.jpg ADDED
examples/r000228/r000228_10.jpg ADDED
examples/r000228/r000228_11.jpg ADDED
examples/r000228/r000228_12.jpg ADDED
examples/r000228/r000228_13.jpg ADDED
examples/r000228/r000228_14.jpg ADDED
examples/r000228/r000228_15.jpg ADDED
examples/r000228/r000228_16.jpg ADDED
examples/r000228/r000228_17.jpg ADDED
examples/r000228/r000228_18.jpg ADDED
examples/r000228/r000228_19.jpg ADDED
examples/r000228/r000228_20.jpg ADDED
examples/r000228/r000228_21.jpg ADDED
examples/r000228/r000228_22.jpg ADDED
examples/r000228/r000228_23.jpg ADDED
examples/r000688/r000688_00.jpg ADDED
examples/r000688/r000688_01.jpg ADDED
examples/r000688/r000688_02.jpg ADDED
examples/r000688/r000688_03.jpg ADDED
examples/r000688/r000688_04.jpg ADDED
examples/r000688/r000688_05.jpg ADDED
examples/r000688/r000688_06.jpg ADDED
examples/r000688/r000688_07.jpg ADDED