Upload files with `vila-upload`.
Upload utils.py
Upload auto_processor.py
Upload siglip_encoder.py
Upload README.md
Upload mm_utils.py
Upload builder.py
Upload config.json
Upload modeling_vila.py
Upload llm/vocab.json
Upload llm/tokenizer_config.json
Upload llm/added_tokens.json
Upload llm/tokenizer.json
- .gitattributes +1 -0
- README.md +66 -4
- auto_processor.py +234 -0
- builder.py +0 -1
- config.json +1 -0
- llm/added_tokens.json +3 -0
- llm/tokenizer.json +3 -0
- llm/tokenizer_config.json +24 -0
- llm/vocab.json +0 -0
- mm_utils.py +1 -1
- modeling_vila.py +45 -13
- siglip_encoder.py +6 -8
- utils.py +34 -2
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+llm/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -2,8 +2,6 @@
 license: cc
 language:
 - en
-base_model:
-- Qwen/Qwen2.5-1.5B-Instruct
 ---
 
 Dependency setups:
@@ -13,6 +11,7 @@ pip install transformers==4.46 accelerate opencv-python torchvision einops
 pip install git+https://github.com/bfshi/scaling_on_scales.git
 ```
 
+## Usage
 
 ```python
 from transformers import AutoConfig, AutoModel
@@ -20,9 +19,13 @@ from termcolor import colored
 
 model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
 
-#
-
+# you can use config
+config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+model = AutoModel.from_config(config, trust_remote_code=True)
+# or directly from_pretrained
 model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
+
+# example: generate with raw text
 res = model.generate_content([
     "how are you today?"
 ])
@@ -30,10 +33,69 @@ print(colored(res, "cyan", attrs=["bold"]))
 
 print("---" * 40)
 
+# example: generate with text + image
 import PIL.Image
 response = model.generate_content([
     PIL.Image.open("inference_test/test_data/caption_meat.jpeg"),
     "describe the image?"
 ])
 print(colored(response, "cyan", attrs=["bold"]))
+```
+
+## AutoProcessor
+
+We also support the `AutoProcessor` class if you want to finetune the model.
+
+```python
+from transformers import AutoProcessor, AutoModel
+
+model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+
+gpt_conv = [{
+    "role": "user",
+    "content": [
+        {"type": "image", "path": "demo_images/demo_img_1.png"},
+        {"type": "text", "text": "Describe this image."}
+    ]
+}]
+
+inputs = processor.apply_chat_template(conversation=gpt_conv, padding=True, return_tensors="pt")
+model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
+output_ids = model.generate(
+    input_ids=inputs.input_ids,
+    media={
+        "image": inputs.image,
+    },
+    media_config={
+        "image": {}
+    },
+    generation_config=model.generation_config,
+    max_new_tokens=256,
+)
+print(processor.tokenizer.decode(output_ids[0], skip_special_tokens=True))
+
+##### the above code is equivalent to
+# response = model.generate_content([
+#     PIL.Image.open("demo_images/demo_img_1.png"),
+#     "describe the image?"
+# ])
+# print(colored(response, "cyan", attrs=["bold"]))
+```
+
+## Model Convert
+
+The following code converts a conventional NVILA model to an HF-compatible model.
+
+```python
+import os, os.path as osp, shutil
+from transformers import AutoConfig, AutoModel, AutoProcessor, AutoTokenizer, AutoImageProcessor
+
+model_path = "Efficient-Large-Model/NVILA-Lite-2B"
+output_dir = "NVILA-Lite-2B-hf-preview"
+
+if osp.isdir(output_dir):
+    shutil.rmtree(output_dir)
+from llava.remote_code.modeling_vila import VILAForCasualLM
+VILAForCasualLM.convert_vila_dev_ckpt_to_remote(model_path, output_dir, copy=False)
 ```
auto_processor.py
ADDED
@@ -0,0 +1,234 @@
+import os, os.path as osp
+from collections import defaultdict
+from typing import List, Union
+
+from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoImageProcessor, AutoProcessor
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput, VideoInput
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+from transformers.utils import logging
+
+from .constants import DEFAULT_IMAGE_TOKEN, MEDIA_TOKENS
+from .media import Image, Video
+from .mm_utils import process_image, process_images
+from .media import extract_media
+from .tokenizer_utils import tokenize_conversation
+
+
+class VILAProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+        },
+    }
+
+
+class VILAProcessor(ProcessorMixin):
+    # attributes = ["image_processor", "tokenizer"]
+    attributes = []
+    # valid_kwargs = ["chat_template"]
+    valid_kwargs = []
+    # image_processor_class = "VILAImageProcessor"
+    # tokenizer_class = ("VILATokenizer", "VILATokenizerFast")
+
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, config=None, **kwargs):
+        # self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
+        # self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
+        self.image_token = MEDIA_TOKENS["image"]
+        self.video_token = MEDIA_TOKENS["video"]
+        self.config = config
+        self.image_processor = image_processor
+        self.tokenizer = tokenizer
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        if os.path.isdir(pretrained_model_name_or_path):
+            pretrained_model_name_or_path = pretrained_model_name_or_path
+        else:
+            print(f"pretrained_model_name_or_path {pretrained_model_name_or_path} is not a directory, downloading")
+            from huggingface_hub import HfApi, snapshot_download
+            pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
+
+        image_processor = AutoImageProcessor.from_pretrained(osp.join(pretrained_model_name_or_path, "vision_tower"), trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(osp.join(pretrained_model_name_or_path, "llm"), trust_remote_code=True)
+        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
+
+        return cls(image_processor=image_processor, tokenizer=tokenizer, config=config)
+
+    def __repr__(self):
+        return f"VILAProcessor(image_processor={self.image_processor}, tokenizer={self.tokenizer}, config={self.config})"
+
+    def __call__(
+        self,
+        conversation,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        videos: VideoInput = None,
+        **kwargs: Unpack[VILAProcessorKwargs],
+    ) -> BatchFeature:
+        # TODO: should be merged with llava_arch.py/generate_content()
+        # TODO (extract and preprocess should be done together, as the preprocess of image and video can be different, i.e. when dynamic res is used)
+        media = extract_media(conversation, self.config)
+        # Process media
+        media_config = defaultdict(dict)
+        for name in media:
+            if name == "image":
+                if len(media["image"]) == 1 and self.config.image_aspect_ratio in ["dynamic", "dynamic_s2"]:
+                    self.config.image_processor = self.image_processor
+                    if self.config.image_aspect_ratio == "dynamic":
+                        images = process_image(media["image"][0], self.config, None, enable_dynamic_res=True).half()
+                        conversation[0]["value"] = conversation[0]["value"].replace(
+                            DEFAULT_IMAGE_TOKEN, f"{DEFAULT_IMAGE_TOKEN}\n" * images.shape[0]
+                        )
+                    else:
+                        if type(self.config.s2_scales) is str:
+                            self.config.s2_scales = list(map(int, self.config.s2_scales.split(",")))
+                        images, block_sizes = process_image(
+                            media["image"][0], self.config, None, enable_dynamic_s2=True
+                        )
+                        images = images.half()
+                        media_config[name]["block_sizes"] = [block_sizes]
+                else:
+                    images = process_images(media["image"], self.vision_tower.image_processor, self.config).half()
+                media[name] = [image for image in images]
+            elif name == "video":
+                media[name] = [
+                    process_images(images, self.vision_tower.image_processor, self.config).half()
+                    for images in media[name]
+                ]
+            else:
+                raise ValueError(f"Unsupported media type: {name}")
+
+        input_ids = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True).cuda().unsqueeze(0)
+        # Set up the generation config
+        # print(input_ids.shape); print(media); input()
+        return BatchFeature(data={"input_ids": input_ids, **media})
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    def post_process_image_text_to_text(self, generated_outputs):
+        """
+        Post-process the output of the model to decode the text.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+
+        Returns:
+            `List[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(
+            generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+    # inputs = processor(conversation=llavaconv, padding=True, return_tensors="pt")
+    def apply_chat_template(self, conversation, add_generation_prompt=True, **kwargs):
+        vila_conv = []
+
+        for chat in conversation:
+            vila_chat = {
+                "from": "",
+                "value": []
+            }
+            if chat["role"] == "user":
+                # user allows to input image and text
+                vila_chat["from"] = "human"
+                for content in chat["content"]:
+                    if content["type"] == "image":
+                        vila_chat["value"].append(Image(content["path"]))
+                    elif content["type"] == "text":
+                        vila_chat["value"].append(content["text"])
+                    else:
+                        raise ValueError(f"Unsupported content type: {content['type']}")
+            elif chat["role"] == "assistant":
+                vila_chat["from"] = "gpt"
+                for content in chat["content"]:
+                    assert content["type"] == "text", f"Unsupported content type: {content['type']}"
+                    vila_chat["value"].append(content["text"])
+            vila_conv.append(vila_chat)
+
+        return self(vila_conv)
+
+if __name__ == "__main__":
+    # gpt style: user, assistant
+    # vila style: human, gpt
+    gpt_conv = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "path": "demo_images/demo_img_1.png"},
+                {"type": "text", "text": "Describe this image."}
+            ]
+        }
+    ]
+
+    llavaconv = [
+        {
+            "from": "human",
+            "value": [
+                PIL.Image.open("demo_images/demo_img_1.png"),
+                "Describe this image.",
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(output_dir, trust_remote_code=True)
+    inputs = processor.apply_chat_template(conversation=gpt_conv, padding=True, return_tensors="pt")
+    # model = llava.load("Efficient-Large-Model/qwen25_2B_3x3-sft").cuda()
+    # print(model)
+    model_path = "NVILA-Lite-2B-hf-preview"
+    model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
+    # res = model.generate_content(["how are you today?"])
+    # print(model.config)
+    # print(model.tokenizer)
+    # print(res)
+    # exit(0)
+
+    processor = VILAProcessor(
+        config=model.config,
+        image_processor=model.vision_tower.image_processor,
+        tokenizer=model.tokenizer,
+    )
+
+    # TODO: add padding, return_tensors,
+    inputs = processor(conversation=llavaconv, padding=True, return_tensors="pt")
+    print(inputs.keys(), inputs.input_ids.shape, [_.shape for _ in inputs.image])
+    print("vila conv pass")
+
+    inputs = processor.apply_chat_template(conversation=gpt_conv, padding=True, return_tensors="pt")
+    print(inputs.keys(), inputs.input_ids.shape, [_.shape for _ in inputs.image])
+    print("gpt conv pass")
+
+    output_ids = model.generate(
+        input_ids=inputs.input_ids,
+        media={
+            "image": inputs.image,
+        },
+        media_config={
+            "image": {}
+        },
+        generation_config=model.generation_config,
+        max_new_tokens=100,
+    )
+    print(output_ids)
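Since `apply_chat_template` above maps both `user` and `assistant` turns (to `human`/`gpt`), a multi-turn conversation can be fed the same way. The sketch below is illustrative only: the repo id comes from the README above, while the image path and the assistant reply are made-up placeholders.

```python
# Illustrative multi-turn usage of the new VILAProcessor.apply_chat_template
# ("user" -> "human", "assistant" -> "gpt"); image path and assistant text are placeholders.
from transformers import AutoModel, AutoProcessor

model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")

conversation = [
    {"role": "user", "content": [
        {"type": "image", "path": "demo_images/demo_img_1.png"},
        {"type": "text", "text": "Describe this image."},
    ]},
    {"role": "assistant", "content": [
        {"type": "text", "text": "<previous model answer>"},
    ]},
    {"role": "user", "content": [
        {"type": "text", "text": "What else can you tell me about it?"},
    ]},
]

inputs = processor.apply_chat_template(conversation=conversation)
output_ids = model.generate(
    input_ids=inputs.input_ids,
    media={"image": inputs.image},
    media_config={"image": {}},
    generation_config=model.generation_config,
    max_new_tokens=128,
)
print(processor.tokenizer.decode(output_ids[0], skip_special_tokens=True))
```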
builder.py
CHANGED
@@ -229,7 +229,6 @@ def build_llm_and_tokenizer(
             chat_template = fd.read()
         tokenizer.chat_template = chat_template.replace(" ", "").replace("\n", "")
 
-    # NOTE(ligeng): disable temporarially, let see will any bugs introduce
     # Set stop tokens for the tokenizer
     tokenizer.stop_tokens = infer_stop_tokens(tokenizer)
     tokenizer.stop_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.stop_tokens)
config.json
CHANGED
@@ -269,6 +269,7 @@
   },
   "version": "2.0",
   "auto_map": {
+    "AutoProcessor": "auto_processor.VILAProcessor",
     "AutoConfig": "modeling_vila.VILAConfig",
     "AutoModel": "modeling_vila.VILAForCasualLM",
     "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM"
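With the new `AutoProcessor` entry in `auto_map`, the generic factory resolves to the custom processor whenever remote code is trusted. A minimal check (repo id taken from the README):

```python
# The added auto_map entry lets AutoProcessor dispatch to auto_processor.VILAProcessor.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "Efficient-Large-Model/NVILA-Lite-2B-hf-preview", trust_remote_code=True
)
print(type(processor).__name__)  # expected: VILAProcessor
```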
llm/added_tokens.json
CHANGED
@@ -1,4 +1,7 @@
 {
+  "<image>": 151649,
+  "<vila/sentinel>": 151648,
+  "<vila/video>": 151650,
   "<|endoftext|>": 151643,
   "<|im_end|>": 151645,
   "<|im_start|>": 151644,
llm/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7fc37d325d718c91319f527fbe8258c03ac890aba2f252b85af89a625927908a
+size 11419189
llm/tokenizer_config.json
CHANGED
@@ -40,6 +40,30 @@
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "151648": {
+      "content": "<vila/sentinel>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<vila/video>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
   "additional_special_tokens": [
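A quick sanity check for the new special tokens, assuming the updated `llm/` tokenizer files from this commit are available locally:

```python
# The new media tokens should resolve to the ids registered in added_tokens.json.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("NVILA-Lite-2B-hf-preview/llm")
for token in ["<vila/sentinel>", "<image>", "<vila/video>"]:
    print(token, tokenizer.convert_tokens_to_ids(token))
# expected: 151648, 151649, 151650
```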
llm/vocab.json
CHANGED
The diff for this file is too large to render. See raw diff.
mm_utils.py
CHANGED
@@ -26,7 +26,7 @@ import torch
 from PIL import Image
 from transformers import StoppingCriteria
 
-from .constants import DEFAULT_IMAGE_TOKEN
+from llava.constants import DEFAULT_IMAGE_TOKEN
 
 
 def get_frame_from_vcap(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
modeling_vila.py
CHANGED
@@ -48,8 +48,8 @@ from .media_encoder import BasicImageEncoder, BasicVideoEncoder
 from .mm_utils import process_image, process_images
 from .siglip_encoder import SiglipVisionTower, SiglipVisionTowerDynamicS2, SiglipVisionTowerS2
 from .tokenizer_utils import tokenize_conversation
-from .utils import get_model_config
-
+from .utils import get_model_config, load_tokenizer_then_handle_media_tokens_and_chat_template
+from .auto_processor import VILAProcessor
 
 # from llava.constants import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, NUM_EXTRA_TOKENS
 # quick hack for remote code
@@ -217,6 +217,7 @@ class VILAPretrainedModel(PreTrainedModel):
         output_dir: str = None,
         vila_version: str | None = None,
         conv_mode: str | None = None,
+        copy: bool = True,
         *model_args,
         **kwargs,
     ):
@@ -225,8 +226,11 @@
 
         if os.path.isdir(model_path):
             model_path = model_path
-
-
+        else:
+            api = HfApi()
+            model_path = snapshot_download(model_path, local_dir=output_dir)
+            print("downloading HF model to", model_path)
+
         if check_dot_in_model_path(model_path) and output_dir is None:
             raise ValueError(
                 f"Model path {model_path} contains a dot, which will affect the remote code loading. Please specify the output directory without dot in the path to fix this issue."
@@ -238,15 +242,12 @@
         if vila_version is None:
             vila_version = get_vila_version(model_path)
 
-        if api.repo_exists(model_path):
-            model_path = snapshot_download(model_path, local_dir=output_dir)
-            print("downloading HF model to", model_path)
-
         cfg_path = os.path.join(model_path, "config.json")
         config = json.load(open(cfg_path))
         config["version"] = "2.0"  # nvila tag
         config["architectures"] = ["VILAForCasualLM"]
         config["auto_map"] = {
+            "AutoProcessor": "auto_processor.VILAProcessor",
             "AutoConfig": "modeling_vila.VILAConfig",
             "AutoModel": "modeling_vila.VILAForCasualLM",
             "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM",
@@ -261,19 +262,44 @@
         with open(jinja_path, "w") as f:
             f.write(jinja_template)
         json.dump(config, open(cfg_path, "w"), indent=2)
-        self.copy_remote_py_files(model_path)
+        self.copy_remote_py_files(model_path, copy=copy)
+
+        ##########################################################################################
+        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+        tokenizer = load_tokenizer_then_handle_media_tokens_and_chat_template(model_path, config)
+        tokenizer.save_pretrained(osp.join(output_dir, "llm"))
+        ##########################################################################################
 
     @classmethod
-    def copy_remote_py_files(cls, output_dir):
+    def copy_remote_py_files(cls, output_dir, copy=True):
         ## copy .py and README for next loading remote code
         current_file_path = os.path.abspath(__file__)
         current_folder = os.path.dirname(current_file_path)
         for file_name in os.listdir(current_folder):
+            if file_name == "INSTRUCTIONS.md":
+                src_fname = os.path.join(current_folder, file_name)
+                dst_fname = os.path.join(output_dir, "README.md")
+                if os.path.exists(dst_fname):
+                    old_readme = open(dst_fname, 'r').read()
+                else:
+                    old_readme = ""
+                with open(src_fname, 'r') as src, open(dst_fname, 'w') as dst:
+                    dst.write(src.read())
+                    dst.write(old_readme)
+                print("[HF remote code] README ", src_fname, "to", dst_fname)
             if file_name.endswith(".py") or file_name.endswith(".jinja"):
                 full_file_name = os.path.join(current_folder, file_name)
                 if os.path.isfile(full_file_name):
-
-
+                    if copy:
+                        shutil.copy(full_file_name, output_dir)
+                        print("[HF remote code] copying", full_file_name, "to", output_dir)
+                    else:
+                        # symlink to ease development
+                        if os.path.exists(os.path.join(output_dir, file_name)):
+                            os.remove(os.path.join(output_dir, file_name))
+                        os.symlink(full_file_name, os.path.join(output_dir, file_name))
+                        print("[HF remote code] linking", full_file_name, "to", output_dir)
+
 
     def save_pretrained(self, output_dir, state_dict=None):
         if state_dict is None:
@@ -358,7 +384,6 @@
         # XGrammar tokenizer and grammar compiler
         # lazy init only when specified json output during inference
         self.grammar_compiler = None
-
         self.llm.resize_token_embeddings(len(self.tokenizer))
         return self.llm, self.tokenizer
 
@@ -1077,6 +1102,13 @@ class VILAForCasualLM(VILAPretrainedModel):
         # Set up the generation config
         generation_config = generation_config or self.default_generation_config
 
+        # print("input_ids", input_ids.shape)
+        # print(input_ids)
+        # print(self.tokenizer.batch_decode(input_ids))
+        # print("media", {k: len(v) for k, v in media.items()})
+        # print("media_config", media_config)
+        # print("generation_config", generation_config)
+        # input("wait for debug")
         # Generate the response
         try:
             output_ids = self.generate(
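The new `copy` flag controls whether `copy_remote_py_files` copies the remote-code `.py`/`.jinja` files into the output directory or symlinks them for development. A sketch of the convert call (paths mirror the README's Model Convert example):

```python
# copy=True (default) copies the remote-code files into output_dir;
# copy=False symlinks them so local edits are picked up without re-converting.
from llava.remote_code.modeling_vila import VILAForCasualLM

VILAForCasualLM.convert_vila_dev_ckpt_to_remote(
    "Efficient-Large-Model/NVILA-Lite-2B",   # source checkpoint
    "NVILA-Lite-2B-hf-preview",              # output_dir
    copy=False,
)
```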
siglip_encoder.py
CHANGED
@@ -19,16 +19,12 @@ import torch.nn as nn
 import torch.nn.functional as F
 from accelerate.hooks import add_hook_to_module
 from einops import rearrange
-
+from s2wrapper import forward as multiscale_forward
 from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, SiglipImageProcessor
 from transformers.image_processing_utils import BaseImageProcessor
+from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from transformers.models.siglip import SiglipVisionModel
 
-from s2wrapper import forward as multiscale_forward
-
-# from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
-def is_deepspeed_zero3_enabled():
-    return False
 
 class VisionTower(nn.Module):
     def __init__(self, vision_tower, args, delay_load=False):
@@ -77,8 +73,10 @@ class VisionTower(nn.Module):
         import torch.nn as nn
 
         if is_deepspeed_zero3_enabled():
-
-
+            try:
+                import deepspeed
+            except ImportError:
+                raise ImportError("DeepSpeed is not installed. Please install it with `pip install deepspeed`.")
             with deepspeed.zero.GatheredParameters([old_embeddings.weight], modifier_rank=None):
                 old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
         else:
utils.py
CHANGED
@@ -19,8 +19,40 @@ import os.path as osp
 
 from huggingface_hub import repo_exists, snapshot_download
 from huggingface_hub.utils import HFValidationError, validate_repo_id
-from transformers import AutoConfig, PretrainedConfig
-
+from transformers import AutoConfig, PretrainedConfig, AutoTokenizer
+
+from .configuration_vila import VILAConfig
+from .constants import MEDIA_TOKENS
+from .tokenizer_utils import infer_stop_tokens
+
+def load_tokenizer_then_handle_media_tokens_and_chat_template(model_name_or_path, config: VILAConfig, model_max_length=None):
+    # TODO(ligeng): a lot of copy-paste code, refactor to make a single function
+    tokenizer = AutoTokenizer.from_pretrained(osp.join(model_name_or_path, "llm"), padding_side="right", use_fast=True, legacy=False)
+    if model_max_length is not None:
+        tokenizer.model_max_length = model_max_length
+
+    # Load chat template if specified.
+    if getattr(config, "chat_template", None) is not None:
+        print(f"Using chat template: {config.chat_template}")
+        fpath = os.path.join(os.path.dirname(__file__), "chat_templates", f"{config.chat_template}.jinja")
+        if not os.path.exists(fpath):
+            fpath = os.path.join(os.path.dirname(model_name_or_path), f"{config.chat_template}.jinja")
+        with open(fpath) as fd:
+            chat_template = fd.read()
+        tokenizer.chat_template = chat_template.replace(" ", "").replace("\n", "")
+
+    # Set stop tokens for the tokenizer
+    tokenizer.stop_tokens = infer_stop_tokens(tokenizer)
+    tokenizer.stop_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.stop_tokens)
+
+    # Add media tokens to the tokenizer
+    tokenizer.media_tokens = MEDIA_TOKENS
+    tokenizer.media_token_ids = {}
+    for name, token in MEDIA_TOKENS.items():
+        tokenizer.add_tokens([token], special_tokens=True)
+        tokenizer.media_token_ids[name] = tokenizer.convert_tokens_to_ids(token)
+
+    return tokenizer
 
 def get_model_config(config):
     default_keys = ["llm_cfg", "vision_tower_cfg", "mm_projector_cfg"]
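The convert path in `modeling_vila.py` uses this helper to rebuild the tokenizer (chat template, stop tokens, media tokens) and save it under `llm/`. A standalone sketch, assuming the helper is importable and using an illustrative local checkpoint path:

```python
# Rebuild and save the tokenizer the same way convert_vila_dev_ckpt_to_remote does.
import os.path as osp
from transformers import AutoConfig

model_path = "NVILA-Lite-2B-hf-preview"  # illustrative local checkpoint directory
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
tokenizer = load_tokenizer_then_handle_media_tokens_and_chat_template(model_path, config)
tokenizer.save_pretrained(osp.join(model_path, "llm"))
```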