fix eva in data_utils

data_utils.py (+34 -18)
@@ -1,11 +1,29 @@
 import decord
-import numpy as np
-import torch
-from PIL import Image
 import random
+import numpy as np
+from PIL import Image
+
+import torch
+from torchvision.transforms import Normalize, Compose, InterpolationMode, ToTensor, Resize
+
+
+def _convert_to_rgb(image):
+    return image.convert('RGB')
+
+
+def image_transform(image_size: int):
+    mean = (0.48145466, 0.4578275, 0.40821073)
+    std = (0.26862954, 0.26130258, 0.27577711)
+
+    normalize = Normalize(mean=mean, std=std)
+    transforms = [
+        Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
+        _convert_to_rgb,
+        ToTensor(),
+        normalize,
+    ]
+    return Compose(transforms)
 
-from eva_clip.transform import image_transform
-image_processor = image_transform(image_size=448, is_train=False)
 
 def preprocess_multimodal(sources, num_segments):
     for source in sources:
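This hunk is the core of the fix: it drops the external eva_clip dependency by inlining the equivalent EVA-CLIP preprocessing (bicubic resize, RGB conversion, CLIP normalization constants) using plain torchvision. A quick smoke test of the inlined transform, assuming a local sample image at frame.jpg (placeholder path):

    from PIL import Image
    from data_utils import image_transform

    processor = image_transform(image_size=448)
    tensor = processor(Image.open('frame.jpg'))  # placeholder image path
    print(tensor.shape)  # torch.Size([3, 448, 448])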
@@ -26,6 +44,7 @@ def preprocess_multimodal(sources, num_segments):
             sentence["content"] = sentence["content"].replace(X_token, replace_token)
     return sources
 
+
 def preprocess(
     sources,
     tokenizer,
@@ -60,9 +79,6 @@ def preprocess(
     else:
         index = random.choice(range(len(en_qa_templates)))
         system_prompt = f"""You are a helpful assistant, {en_qa_templates[index]} 你是一个乐于助人的助手,{ch_qa_templates[index]}"""
-    chat_template = """{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>'
-+ message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}
-{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}"""
     messages = []
     for source in sources:
         message = [{'role': 'system', 'content': system_prompt}]
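With the hand-rolled Llama-3-style Jinja template removed, tokenization now relies entirely on the chat template bundled with the tokenizer, so the tokenizer must match the target model. A minimal sketch of the single-conversation form of the call the code now depends on (the model id is a placeholder chosen because the deleted template used Llama-3 header/eot tokens):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B-Instruct')  # placeholder model id
    conversation = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': 'Describe the video.'},
    ]
    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors='pt')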
@@ -70,14 +86,14 @@ def preprocess(
             message.append(sentence)
         messages.append(message)
 
-    #input_ids = tokenizer.apply_chat_template(messages, chat_template, add_generation_prompt=True, return_tensors='pt')
     input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors='pt')
     return input_ids
-
+
+
 def get_index(fps, max_frame, num_segments):
     num_frames = max_frame
     if num_frames <= num_segments:
-        out_indices =
+        out_indices = np.array([(idx % num_frames) for idx in range(num_segments)])
         out_indices = np.sort(out_indices)
     else:
         out_indices = np.linspace(0, num_frames-1, num_segments)
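The new short-video branch wraps indices modulo num_frames, so a clip with fewer frames than requested segments still yields exactly num_segments (repeated) indices. A worked example of that expression:

    # 5-frame clip sampled into 8 segments: indices wrap and repeat.
    num_frames, num_segments = 5, 8
    out = sorted(idx % num_frames for idx in range(num_segments))
    print(out)  # [0, 0, 1, 1, 2, 2, 3, 4]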
@@ -85,22 +101,23 @@ def get_index(fps, max_frame, num_segments):
     durations = [idx.item() / fps for idx in out_indices]
     return out_indices.astype(np.int64), durations
 
+
 def read_video(video_path, num_segments):
+    image_processor = image_transform(image_size=448)
     vr = decord.VideoReader(video_path)
-    max_frame = len(vr) - 1
     fps = float(vr.get_avg_fps())
 
-
-    frame_indices, durations = get_index(fps, max_frame, num_segments)
+    frame_indices, durations = get_index(fps, len(vr) - 1, num_segments)
     video = []
     for frame_index in frame_indices:
         image = Image.fromarray(vr[frame_index].asnumpy())
         video.append(image_processor(image).unsqueeze(0))
     video = torch.concat(video)
-    return video, torch.Tensor(durations)
+    return video, torch.Tensor(durations)
+
 
 def get_input(video_path, num_segments, question, history, tokenizer, s_id):
-    video, durations
+    video, durations = read_video(video_path, num_segments)
     if history == None:
         conversations = []
         conversations.append({'role': 'user', 'content': f'<video>\n{question}'})
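Since the module-level image_processor is gone, read_video builds the transform on each call, keeping import side effects out of the module at the cost of re-creating a small Compose object per video. Expected shapes, assuming a local clip at sample.mp4 (placeholder path):

    video, durations = read_video('sample.mp4', num_segments=8)
    print(video.shape)      # torch.Size([8, 3, 448, 448])
    print(durations.shape)  # torch.Size([8])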
@@ -113,8 +130,7 @@ def get_input(video_path, num_segments, question, history, tokenizer, s_id):
 
     return video, durations, input_ids, conversations
 
+
 def add_pred_to_history(history, pred):
     history.append({'role': 'assistant', 'content': pred})
     return history
-
-
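Taken together, the helpers support a simple multi-turn loop. A hedged sketch (the video path is a placeholder, the model call is a stand-in, and get_input's non-None history branch is not shown in this diff; only get_input and add_pred_to_history come from this file):

    history = None
    video, durations, input_ids, history = get_input('sample.mp4', 8, 'What happens first?', history, tokenizer, s_id=0)
    # ... run the model on (video, durations, input_ids) to obtain `pred` ...
    history = add_pred_to_history(history, pred)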