svjack commited on
Commit
1cb24cf
1 Parent(s): 8687b75

Upload video_regression_with_prompt_script.py

Browse files
video_regression_with_prompt_script.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ python video_regression_with_prompt_script.py xiangling_mp4_dir_tiny
3
+
4
+ 1. **visual quality**:
5
+ - **涵义**: 视觉质量
6
+ - **解释**: 视频在清晰度、分辨率、亮度和色彩等方面的质量。这个维度评估视频的视觉表现,包括图像的清晰度、色彩的准确性和整体的视觉吸引力。
7
+
8
+ 2. **temporal consistency**:
9
+ - **涵义**: 时间一致性
10
+ - **解释**: 视频中物体或人物的一致性。这个维度评估视频在时间上的连贯性,即视频中的物体或人物在不同帧之间是否保持一致,没有明显的跳跃或不连贯的现象。
11
+
12
+ 3. **dynamic degree**:
13
+ - **涵义**: 动态程度
14
+ - **解释**: 视频中动态变化的程度。这个维度评估视频的动态性,即视频中物体或场景的变化程度,包括运动的频率和幅度。
15
+
16
+ 4. **text-to-video alignment**:
17
+ - **涵义**: 文本与视频的对齐
18
+ - **解释**: 文本提示与视频内容之间的对齐程度。这个维度评估视频内容与给定文本提示之间的匹配程度,即视频是否准确地反映了文本提示所描述的内容。
19
+
20
+ 5. **factual consistency**:
21
+ - **涵义**: 事实一致性
22
+ - **解释**: 视频内容与常识和事实知识的一致性。这个维度评估视频内容是否符合常识和事实知识,即视频中的内容是否真实可信,没有明显的逻辑错误或与现实不符的情况。
23
+
24
+ import pandas as pd
25
+ edf = pd.read_csv("evaluation_results.csv")
26
+ edf.describe()
27
+
28
+ print(edf.sort_values(by = "temporal consistency", ascending = True).head(5).to_markdown())
29
+
30
+ | | video_name | visual quality | temporal consistency | dynamic degree | text-to-video alignment | factual consistency |
31
+ |---:|:---------------------------------------------------------------------------------------------------|-----------------:|-----------------------:|-----------------:|--------------------------:|----------------------:|
32
+ | 26 | solo,Xiangling,_shave_with_a_razor__genshin_impact__,1girl,highres,_seed_3140464511.mp4 | 2.8 | 1.14 | 2.97 | 2.78 | 1.26 |
33
+ | 0 | solo,Xiangling,_carry_money_in_a_wallet__genshin_impact__,1girl,highres,_seed_1294598571.mp4 | 2.69 | 1.2 | 2.88 | 2.7 | 1.34 |
34
+ | 9 | solo,Xiangling,_sweep_dust_with_a_broom__genshin_impact__,1girl,highres,_seed_3483804345.mp4 | 2.72 | 1.22 | 2.89 | 2.86 | 1.2 |
35
+ | 25 | solo,Xiangling,_brush_teeth_with_a_toothbrush__genshin_impact__,1girl,highres,_seed_2612536091.mp4 | 2.75 | 1.23 | 2.91 | 2.67 | 1.44 |
36
+ | 14 | solo,Xiangling,_store_trash_in_a_bag__genshin_impact__,1girl,highres,_seed_4130052080.mp4 | 2.72 | 1.25 | 2.86 | 2.77 | 1.27 |
37
+ '''
38
+
39
+ import os
40
+ import time
41
+ import json
42
+ import numpy as np
43
+ import av
44
+ import torch
45
+ from PIL import Image
46
+ import functools
47
+ from transformers import AutoProcessor, AutoConfig
48
+ from models.idefics2 import Idefics2ForSequenceClassification
49
+ from models.conversation import conv_templates
50
+ from typing import List
51
+ import csv
52
+ import argparse
53
+ from tqdm import tqdm
54
+ import shutil
55
+
56
# Initialise the processor and the model.
# Both are loaded from the "TIGER-Lab/VideoScore" checkpoint when this module
# is imported; the model is loaded in bfloat16 and put into eval mode.
processor = AutoProcessor.from_pretrained("TIGER-Lab/VideoScore")
model = Idefics2ForSequenceClassification.from_pretrained("TIGER-Lab/VideoScore", torch_dtype=torch.bfloat16).eval()

# Maximum number of frames sampled from a single video (used by eval_video).
MAX_NUM_FRAMES = 24
# Idefics2 conversation template; not referenced by the functions in this script.
conv_template = conv_templates["idefics_2"]
62
+
63
# Instruction text sent to the model for every video. eval_video() fills the
# "{text_prompt}" placeholder via str.format and then appends one "<image> "
# token per sampled frame after the final line.
VIDEO_EVAL_PROMPT = """
Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
please watch the following frames of a given video and see the text prompt for generating the video,
then give scores from 5 different dimensions:
(1) visual quality: the quality of the video in terms of clearness, resolution, brightness, and color
(2) temporal consistency, the consistency of objects or humans in video
(3) dynamic degree, the degree of dynamic changes
(4) text-to-video alignment, the alignment between the text prompt and the video content
(5) factual consistency, the consistency of the video content with the common-sense and factual knowledge

For each dimension, output a number from [1,2,3,4],
in which '1' means 'Bad', '2' means 'Average', '3' means 'Good',
'4' means 'Real' or 'Perfect' (the video is like a real video)
Here is an output example:
visual quality: 4
temporal consistency: 4
dynamic degree: 3
text-to-video alignment: 1
factual consistency: 2

For this video, the text prompt is "{text_prompt}",
all the frames of video are as follows:

"""
87
+
88
# Names of the scoring dimensions. The order matters: score() labels the i-th
# output logit with aspect_mapping[i], and main() creates one output folder
# per entry.
aspect_mapping = [
    "visual quality",
    "temporal consistency",
    "dynamic degree",
    "text-to-video alignment",
    "factual consistency",
]
95
+
96
def score(prompt: str, images: List[Image.Image]):
    """Run the regression model on a prompt plus frames and return aspect scores.

    Args:
        prompt: Text passed to the processor. ``eval_video`` supplies the
            evaluation prompt followed by one ``<image>`` token per frame.
            Must be non-empty.
        images: Frames to score. Each entry may be a ``PIL.Image``, a file
            path (opened with ``Image.open``), or a list of either; nested
            lists are flattened one level. ``None`` or an empty list means
            "no images".

    Returns:
        A dict mapping ``aspect_mapping[i]`` to the i-th logit of the first
        batch item, rounded to two decimals.

    Raises:
        ValueError: If ``prompt`` is empty.
    """
    if not prompt:
        raise ValueError("Please provide a prompt")
    model.to("cuda")

    # The original replaced an empty ``images`` with None and then iterated
    # over it, which raised TypeError. Iterate over an empty list instead and
    # hand None to the processor further down, which is what the original
    # ``images = None`` assignment was evidently aiming for.
    flatten_images = []
    for x in images or []:
        if isinstance(x, list):
            flatten_images.extend(x)
        else:
            flatten_images.append(x)

    # Entries given as paths are opened lazily here; PIL images pass through.
    flatten_images = [Image.open(x) if isinstance(x, str) else x for x in flatten_images]

    inputs = processor(text=prompt, images=flatten_images or None, return_tensors="pt")
    # Move every tensor to wherever the model currently lives.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    num_aspects = logits.shape[-1]

    aspect_scores = {}
    for i in range(num_aspects):
        # .item() converts the (bfloat16) scalar tensor to a Python float.
        aspect_scores[aspect_mapping[i]] = round(logits[0, i].item(), 2)
    return aspect_scores
124
+
125
def read_video_pyav(container, indices):
    """Decode selected frames of video stream 0 and stack them as RGB arrays.

    Args:
        container: An opened PyAV container. It is rewound with ``seek(0)``
            and decoded sequentially from the start.
        indices: Iterable of 0-based frame positions (in decode order) to
            keep. Order and duplicates are irrelevant; frames are returned
            in decode order.

    Returns:
        ``np.ndarray`` built with ``np.stack`` from each kept frame's
        ``to_ndarray(format="rgb24")`` result (presumably shaped
        ``(num_frames, height, width, 3)`` -- confirm against PyAV).

    Raises:
        ValueError: If ``indices`` is empty or none of the requested frames
            could be decoded.
    """
    # A set gives O(1) membership per decoded frame; ``i in ndarray`` is a
    # linear scan each time.
    wanted = {int(i) for i in indices}
    if not wanted:
        raise ValueError("indices must contain at least one frame index")
    last_index = max(wanted)

    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i in wanted:
            frames.append(frame)
        if i >= last_index:
            # Nothing further is needed; stop decoding early.
            break

    if not frames:
        raise ValueError("none of the requested frame indices could be decoded")
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
136
+
137
def eval_video(prompt, video: str):
    """Sample frames from a video file and score them against its text prompt.

    Args:
        prompt: Text prompt that was used to generate the video; substituted
            into ``VIDEO_EVAL_PROMPT``.
        video: Path of the video file, opened with ``av.open``.

    Returns:
        The aspect-score dict returned by ``score``.

    Raises:
        ValueError: If the file contains no decodable video frames.
    """
    # Use the container as a context manager so the file handle is released
    # even when decoding fails (the original never closed it).
    with av.open(video) as container:
        total_frames = container.streams.video[0].frames
        if total_frames <= 0:
            # The stream does not advertise a frame count (reported as 0),
            # so count by decoding once. read_video_pyav rewinds the
            # container before reading, so a second pass is fine.
            total_frames = sum(1 for _ in container.decode(video=0))
        if total_frames <= 0:
            raise ValueError(f"No video frames found in {video}")

        if total_frames > MAX_NUM_FRAMES:
            # Evenly spaced sample. linspace guarantees exactly
            # MAX_NUM_FRAMES positions, whereas arange with a float step can
            # emit one extra element because of rounding.
            indices = np.linspace(0, total_frames, MAX_NUM_FRAMES, endpoint=False).astype(int)
        else:
            indices = np.arange(total_frames)
        video_frames = read_video_pyav(container, indices)

    frames = [Image.fromarray(x) for x in video_frames]

    eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)

    # Make sure the prompt carries one "<image>" placeholder per frame.
    num_image_token = eval_prompt.count("<image>")
    if num_image_token < len(frames):
        eval_prompt += "<image> " * (len(frames) - num_image_token)

    aspect_scores = score(eval_prompt, [frames])
    return aspect_scores
157
+
158
def main():
    """Score every video in a directory and record the results.

    Command line: one positional argument, ``video_dir``. Every file in it
    ending in ``.mp4``, ``.avi`` or ``.mkv`` is evaluated with the prompt read
    from the sibling ``<video stem>.txt`` file.

    Side effects:
        * writes ``evaluation_results.csv`` in the current directory with one
          row per video;
        * creates one ``"<aspect>_videos"`` folder per entry of
          ``aspect_mapping`` and copies each video into every folder, with
          the aspect's score prefixed to the file name.

    Raises:
        FileNotFoundError: If a video has no matching ``.txt`` prompt file.
    """
    parser = argparse.ArgumentParser(description="Evaluate videos in a directory.")
    parser.add_argument("video_dir", type=str, help="Directory containing video files.")
    args = parser.parse_args()

    # Sorted so that the CSV row order is reproducible; os.listdir order is
    # arbitrary.
    video_files = sorted(
        os.path.join(args.video_dir, f)
        for f in os.listdir(args.video_dir)
        if f.endswith(('.mp4', '.avi', '.mkv'))
    )

    # Create one output folder per metric.
    output_dirs = {aspect: f"{aspect}_videos" for aspect in aspect_mapping}
    for dir_name in output_dirs.values():
        os.makedirs(dir_name, exist_ok=True)

    # Derive the columns from aspect_mapping so the CSV cannot drift from
    # the names score() emits.
    fieldnames = ["video_name", *aspect_mapping]

    # Explicit UTF-8: file names and prompts may contain non-ASCII text, and
    # the platform default encoding is not guaranteed to handle it.
    with open("evaluation_results.csv", "w", newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for video_file in tqdm(video_files, desc="Evaluating videos"):
            video_name = os.path.basename(video_file)
            txt_file = os.path.splitext(video_file)[0] + ".txt"

            if not os.path.exists(txt_file):
                raise FileNotFoundError(f"Text prompt file {txt_file} not found for video {video_name}")

            with open(txt_file, 'r', encoding="utf-8") as f:
                prompt = f.read().strip()

            aspect_scores = eval_video(prompt, video_file)
            # Build the CSV row separately instead of inserting "video_name"
            # into the score dict and filtering it out again below.
            writer.writerow({"video_name": video_name, **aspect_scores})

            # Copy the video into each metric's folder, prefixing the file
            # name with that metric's value. The loop variable is named
            # ``value`` so it does not shadow the module-level score().
            for aspect, value in aspect_scores.items():
                # Replace the decimal point with an underscore so the names
                # sort conveniently.
                score_str = f"{value:.2f}".replace('.', '_')
                new_video_name = f"{score_str}_{video_name}"
                shutil.copy(video_file, os.path.join(output_dirs[aspect], new_video_name))
196
+
197
# Run the batch evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()