Upload video_regression_with_prompt_script.py

video_regression_with_prompt_script.py (ADDED)
'''
python video_regression_with_prompt_script.py xiangling_mp4_dir_tiny

1. **visual quality**:
   - **Explanation**: The quality of the video in terms of clarity, resolution, brightness, and color. This dimension assesses the video's visual presentation, including image sharpness, color accuracy, and overall visual appeal.

2. **temporal consistency**:
   - **Explanation**: The consistency of objects or people in the video. This dimension assesses temporal coherence: whether objects or people remain consistent from frame to frame, without obvious jumps or discontinuities.

3. **dynamic degree**:
   - **Explanation**: The degree of dynamic change in the video. This dimension assesses how dynamic the video is: how much the objects or scene change, including the frequency and magnitude of motion.

4. **text-to-video alignment**:
   - **Explanation**: The alignment between the text prompt and the video content. This dimension assesses how well the video matches the given prompt: whether the video accurately depicts what the prompt describes.

5. **factual consistency**:
   - **Explanation**: The consistency of the video content with common sense and factual knowledge. This dimension assesses whether the content is plausible and believable, with no obvious logical errors or contradictions with reality.

import pandas as pd
edf = pd.read_csv("evaluation_results.csv")
edf.describe()

print(edf.sort_values(by="temporal consistency", ascending=True).head(5).to_markdown())

|    | video_name | visual quality | temporal consistency | dynamic degree | text-to-video alignment | factual consistency |
|---:|:-----------|---------------:|---------------------:|---------------:|------------------------:|--------------------:|
| 26 | solo,Xiangling,_shave_with_a_razor__genshin_impact__,1girl,highres,_seed_3140464511.mp4 | 2.8 | 1.14 | 2.97 | 2.78 | 1.26 |
| 0 | solo,Xiangling,_carry_money_in_a_wallet__genshin_impact__,1girl,highres,_seed_1294598571.mp4 | 2.69 | 1.2 | 2.88 | 2.7 | 1.34 |
| 9 | solo,Xiangling,_sweep_dust_with_a_broom__genshin_impact__,1girl,highres,_seed_3483804345.mp4 | 2.72 | 1.22 | 2.89 | 2.86 | 1.2 |
| 25 | solo,Xiangling,_brush_teeth_with_a_toothbrush__genshin_impact__,1girl,highres,_seed_2612536091.mp4 | 2.75 | 1.23 | 2.91 | 2.67 | 1.44 |
| 14 | solo,Xiangling,_store_trash_in_a_bag__genshin_impact__,1girl,highres,_seed_4130052080.mp4 | 2.72 | 1.25 | 2.86 | 2.77 | 1.27 |
'''

import os
import time
import json
import numpy as np
import av
import torch
from PIL import Image
import functools
from transformers import AutoProcessor, AutoConfig
from models.idefics2 import Idefics2ForSequenceClassification
from models.conversation import conv_templates
from typing import List
import csv
import argparse
from tqdm import tqdm
import shutil

# Initialize the model and the processor
processor = AutoProcessor.from_pretrained("TIGER-Lab/VideoScore")
model = Idefics2ForSequenceClassification.from_pretrained("TIGER-Lab/VideoScore", torch_dtype=torch.bfloat16).eval()

MAX_NUM_FRAMES = 24
conv_template = conv_templates["idefics_2"]

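# Note: the model is only moved to the GPU lazily, inside score(); loading in
# bfloat16 keeps memory use modest but assumes a CUDA device with bf16 support.
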
VIDEO_EVAL_PROMPT = """
Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
please watch the following frames of a given video and see the text prompt for generating the video,
then give scores from 5 different dimensions:
(1) visual quality: the quality of the video in terms of clearness, resolution, brightness, and color
(2) temporal consistency, the consistency of objects or humans in video
(3) dynamic degree, the degree of dynamic changes
(4) text-to-video alignment, the alignment between the text prompt and the video content
(5) factual consistency, the consistency of the video content with the common-sense and factual knowledge

For each dimension, output a number from [1,2,3,4],
in which '1' means 'Bad', '2' means 'Average', '3' means 'Good',
'4' means 'Real' or 'Perfect' (the video is like a real video)
Here is an output example:
visual quality: 4
temporal consistency: 4
dynamic degree: 3
text-to-video alignment: 1
factual consistency: 2

For this video, the text prompt is "{text_prompt}",
all the frames of video are as follows:

"""

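# Note: the template above contains no "<image>" placeholders; eval_video()
# counts them and appends one "<image> " token per sampled frame before the
# prompt is handed to the processor.
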
aspect_mapping = [
    "visual quality",
    "temporal consistency",
    "dynamic degree",
    "text-to-video alignment",
    "factual consistency",
]

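# The order of aspect_mapping must match the output order of the model's
# regression head: score() reads logits[0, i] back as the score for
# aspect_mapping[i].
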
def score(prompt: str, images: List[Image.Image]):
    if not prompt:
        raise ValueError("Please provide a prompt")
    model.to("cuda")
    if not images:
        raise ValueError("Please provide at least one frame")

    # Flatten a possibly nested list of frames into a single flat list
    flatten_images = []
    for x in images:
        if isinstance(x, list):
            flatten_images.extend(x)
        else:
            flatten_images.append(x)

    flatten_images = [Image.open(x) if isinstance(x, str) else x for x in flatten_images]
    inputs = processor(text=prompt, images=flatten_images, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    num_aspects = logits.shape[-1]
    aspects = [aspect_mapping[i] for i in range(num_aspects)]

    aspect_scores = {}
    for i, aspect in enumerate(aspects):
        aspect_scores[aspect] = round(logits[0, i].item(), 2)
    return aspect_scores

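# Example (a sketch; the frame paths and prompt text below are illustrative only):
#
#   frames = [Image.open("frame_000.png"), Image.open("frame_001.png")]
#   prompt = VIDEO_EVAL_PROMPT.format(text_prompt="a girl cooking")
#   prompt += "<image> " * len(frames)
#   score(prompt, [frames])
#   # -> {'visual quality': ..., 'temporal consistency': ..., 'dynamic degree': ...,
#   #     'text-to-video alignment': ..., 'factual consistency': ...}
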
def read_video_pyav(container, indices):
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

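# read_video_pyav() decodes sequentially and stops after the last requested
# index, so its cost scales with the position of the final sampled frame; for
# the short clips this script targets, that is cheap enough.
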
def eval_video(prompt, video: str):
    container = av.open(video)

    total_frames = container.streams.video[0].frames
    if total_frames > MAX_NUM_FRAMES:
        indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
    else:
        indices = np.arange(total_frames)
    video_frames = read_video_pyav(container, indices)

    frames = [Image.fromarray(x) for x in video_frames]

    eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)

    num_image_token = eval_prompt.count("<image>")
    if num_image_token < len(frames):
        eval_prompt += "<image> " * (len(frames) - num_image_token)

    aspect_scores = score(eval_prompt, [frames])
    return aspect_scores

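# Frame-sampling sketch: for a 120-frame clip with MAX_NUM_FRAMES = 24,
# np.arange(0, 120, 120 / 24).astype(int) yields [0, 5, 10, ..., 115], i.e.
# 24 evenly spaced frames; clips at or under 24 frames are used in full.
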
def main():
    parser = argparse.ArgumentParser(description="Evaluate videos in a directory.")
    parser.add_argument("video_dir", type=str, help="Directory containing video files.")
    args = parser.parse_args()

    video_files = [os.path.join(args.video_dir, f) for f in os.listdir(args.video_dir) if f.endswith(('.mp4', '.avi', '.mkv'))]

    # Create one output folder per metric
    output_dirs = {aspect: f"{aspect}_videos" for aspect in aspect_mapping}
    for dir_name in output_dirs.values():
        os.makedirs(dir_name, exist_ok=True)

    with open("evaluation_results.csv", "w", newline='') as csvfile:
        fieldnames = ["video_name", "visual quality", "temporal consistency", "dynamic degree", "text-to-video alignment", "factual consistency"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for video_file in tqdm(video_files, desc="Evaluating videos"):
            video_name = os.path.basename(video_file)
            txt_file = os.path.splitext(video_file)[0] + ".txt"

            if not os.path.exists(txt_file):
                raise FileNotFoundError(f"Text prompt file {txt_file} not found for video {video_name}")

            with open(txt_file, 'r') as f:
                prompt = f.read().strip()

            aspect_scores = eval_video(prompt, video_file)
            aspect_scores["video_name"] = video_name
            writer.writerow(aspect_scores)

            # Copy the video into each metric's folder, with the score prefixed to the filename
            for aspect, value in aspect_scores.items():  # "value" avoids shadowing the score() function
                if aspect != "video_name":
                    value_str = f"{value:.2f}".replace('.', '_')  # replace the decimal point with an underscore so names sort cleanly
                    new_video_name = f"{value_str}_{video_name}"
                    output_dir = output_dirs[aspect]
                    shutil.copy(video_file, os.path.join(output_dir, new_video_name))

if __name__ == "__main__":
    main()
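
Usage note (a sketch of the expected layout; the file names below are illustrative): each video must sit next to a same-named .txt file holding the prompt it was generated from, otherwise the script raises FileNotFoundError.

    xiangling_mp4_dir_tiny/
        clip_0001.mp4
        clip_0001.txt    # prompt used to generate clip_0001.mp4

    python video_regression_with_prompt_script.py xiangling_mp4_dir_tiny

The run writes evaluation_results.csv and copies every video into five per-aspect folders (e.g. "temporal consistency_videos/"), prefixing each copy's name with its score so an alphabetical listing orders clips from worst to best.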