Ligeng-Zhu committed on
Commit 06f6212 · verified · 1 Parent(s): cc1340e

Upload files with `vila-upload`.


Upload utils.py
Upload auto_processor.py
Upload siglip_encoder.py
Upload README.md
Upload mm_utils.py
Upload builder.py
Upload config.json
Upload modeling_vila.py
Upload llm/vocab.json
Upload llm/tokenizer_config.json
Upload llm/added_tokens.json
Upload llm/tokenizer.json

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ llm/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -2,8 +2,6 @@
2
  license: cc
3
  language:
4
  - en
5
- base_model:
6
- - Qwen/Qwen2.5-1.5B-Instruct
7
  ---
8
 
9
  Dependency setups:
@@ -13,6 +11,7 @@ pip install transformers==4.46 accelerate opencv-python torchvision einops
13
  pip install git+https://github.com/bfshi/scaling_on_scales.git
14
  ```
15
 
 
16
 
17
  ```python
18
  from transformers import AutoConfig, AutoModel
@@ -20,9 +19,13 @@ from termcolor import colored
20
 
21
  model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
22
 
23
- # config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
24
- # model = AutoModel.from_config(config, trust_remote_code=True)
 
 
25
  model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
 
 
26
  res = model.generate_content([
27
  "how are you today?"
28
  ])
@@ -30,10 +33,69 @@ print(colored(res, "cyan", attrs=["bold"]))
30
 
31
  print("---" * 40)
32
 
 
33
  import PIL.Image
34
  response = model.generate_content([
35
  PIL.Image.open("inference_test/test_data/caption_meat.jpeg"),
36
  "describe the image?"
37
  ])
38
  print(colored(response, "cyan", attrs=["bold"]))
39
  ```
 
2
  license: cc
3
  language:
4
  - en
 
 
5
  ---
6
 
7
  Dependency setups:
 
11
  pip install git+https://github.com/bfshi/scaling_on_scales.git
12
  ```
13
 
14
+ ## Usage
15
 
16
  ```python
17
  from transformers import AutoConfig, AutoModel
 
19
 
20
  model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
21
 
22
+ # you can build the model from its config
23
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
24
+ model = AutoModel.from_config(config, trust_remote_code=True)
25
+ # or load it directly with from_pretrained
26
  model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
27
+
28
+ # example: generate with raw text
29
  res = model.generate_content([
30
  "how are you today?"
31
  ])
 
33
 
34
  print("---" * 40)
35
 
36
+ # example: generate with text + image
37
  import PIL.Image
38
  response = model.generate_content([
39
  PIL.Image.open("inference_test/test_data/caption_meat.jpeg"),
40
  "describe the image?"
41
  ])
42
  print(colored(response, "cyan", attrs=["bold"]))
43
+ ```
44
+
45
+ ## AutoProcessor
46
+
47
+ We also support the `AutoProcessor` class if you want to fine-tune the model.
48
+
49
+ ```python
50
+ from transformers import AutoProcessor, AutoModel
51
+
52
+ model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
53
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
54
+
55
+ gpt_conv = [ {
56
+ "role": "user",
57
+ "content": [
58
+ {"type": "image", "path": "demo_images/demo_img_1.png"},
59
+ {"type": "text", "text": "Describe this image."}
60
+ ]
61
+ }]
62
+
63
+ inputs = processor.apply_chat_template(conversation=gpt_conv, padding=True, return_tensors="pt")
64
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
65
+ output_ids = model.generate(
66
+ input_ids=inputs.input_ids,
67
+ media={
68
+ "image": inputs.image,
69
+ },
70
+ media_config={
71
+ "image": {}
72
+ },
73
+ generation_config=model.generation_config,
74
+ max_new_tokens=256,
75
+ )
76
+ print(processor.tokenizer.decode(output_ids[0], skip_special_tokens=True))
77
+
78
+ ##### the above code is equivalent to
79
+ # response = model.generate_content([
80
+ # PIL.Image.open("demo_images/demo_img_1.png"),
81
+ # "describe the image?"
82
+ # ])
83
+ # print(colored(response, "cyan", attrs=["bold"]))
84
+ ```
85
+
86
+ ## Model Convert
87
+
88
+ The following code converts a conventional NVILA model to an HF-compatible model.
89
+
90
+ ```python
91
+ import os, os.path as osp, shutil
92
+ from transformers import AutoConfig, AutoModel, AutoProcessor, AutoTokenizer, AutoImageProcessor
93
+
94
+ model_path = "Efficient-Large-Model/NVILA-Lite-2B"
95
+ output_dir = "NVILA-Lite-2B-hf-preview"
96
+
97
+ if osp.isdir(output_dir):
98
+ shutil.rmtree(output_dir)
99
+ from llava.remote_code.modeling_vila import VILAForCasualLM
100
+ VILAForCasualLM.convert_vila_dev_ckpt_to_remote(model_path, output_dir, copy=False)
101
  ```
auto_processor.py ADDED
@@ -0,0 +1,234 @@
1
+ import os, os.path as osp
2
+ from collections import defaultdict
3
+ from typing import List, Union
4
+
5
+ from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoImageProcessor, AutoProcessor
6
+ from transformers.feature_extraction_utils import BatchFeature
7
+ from transformers.image_utils import ImageInput, VideoInput
8
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
9
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
10
+ from transformers.utils import logging
11
+
12
+ from .constants import DEFAULT_IMAGE_TOKEN, MEDIA_TOKENS
13
+ from .media import Image, Video
14
+ from .mm_utils import process_image, process_images
15
+ from .media import extract_media
16
+ from .tokenizer_utils import tokenize_conversation
17
+
18
+
19
+ class VILAProcessorKwargs(ProcessingKwargs, total=False):
20
+ _defaults = {
21
+ "text_kwargs": {
22
+ "padding": False,
23
+ },
24
+ }
25
+
26
+
27
+ class VILAProcessor(ProcessorMixin):
28
+ # attributes = ["image_processor", "tokenizer"]
29
+ attributes = []
30
+ # valid_kwargs = ["chat_template"]
31
+ valid_kwargs = []
32
+ # image_processor_class = "VILAImageProcessor"
33
+ # tokenizer_class = ("VILATokenizer", "VILATokenizerFast")
34
+
35
+ def __init__(self, image_processor=None, tokenizer=None, chat_template=None, config=None, **kwargs):
36
+ # self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
37
+ # self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
38
+ self.image_token = MEDIA_TOKENS["image"]
39
+ self.video_token = MEDIA_TOKENS["video"]
40
+ self.config = config
41
+ self.image_processor = image_processor
42
+ self.tokenizer = tokenizer
43
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
44
+
45
+ @classmethod
46
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
47
+ if os.path.isdir(pretrained_model_name_or_path):
48
+ pretrained_model_name_or_path = pretrained_model_name_or_path
49
+ else:
50
+ print(f"pretrained_model_name_or_path {pretrained_model_name_or_path} is not a directory, downloading")
51
+ from huggingface_hub import HfApi, snapshot_download
52
+ pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
53
+
54
+ image_processor = AutoImageProcessor.from_pretrained(osp.join(pretrained_model_name_or_path, "vision_tower"), trust_remote_code=True)
55
+ tokenizer = AutoTokenizer.from_pretrained(osp.join(pretrained_model_name_or_path, "llm"), trust_remote_code=True)
56
+ config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
57
+
58
+ return cls(image_processor=image_processor, tokenizer=tokenizer, config=config)
59
+
60
+ def __repr__(self):
61
+ return f"VILAProcessor(image_processor={self.image_processor}, tokenizer={self.tokenizer}, config={self.config})"
62
+
63
+ def __call__(
64
+ self,
65
+ conversation,
66
+ images: ImageInput = None,
67
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
68
+ videos: VideoInput = None,
69
+ **kwargs: Unpack[VILAProcessorKwargs],
70
+ ) -> BatchFeature:
71
+ # TODO: should be merged with llava_arch.py/generate_content()
72
+ # TODO (extract and preprocess should be done together, as the preprocess of image and video can be different, i.e. when dynamic res is used)
73
+ media = extract_media(conversation, self.config)
74
+ # Process media
75
+ media_config = defaultdict(dict)
76
+ for name in media:
77
+ if name == "image":
78
+ if len(media["image"]) == 1 and self.config.image_aspect_ratio in ["dynamic", "dynamic_s2"]:
79
+ self.config.image_processor = self.image_processor
80
+ if self.config.image_aspect_ratio == "dynamic":
81
+ images = process_image(media["image"][0], self.config, None, enable_dynamic_res=True).half()
82
+ conversation[0]["value"] = conversation[0]["value"].replace(
83
+ DEFAULT_IMAGE_TOKEN, f"{DEFAULT_IMAGE_TOKEN}\n" * images.shape[0]
84
+ )
85
+ else:
86
+ if type(self.config.s2_scales) is str:
87
+ self.config.s2_scales = list(map(int, self.config.s2_scales.split(",")))
88
+ images, block_sizes = process_image(
89
+ media["image"][0], self.config, None, enable_dynamic_s2=True
90
+ )
91
+ images = images.half()
92
+ media_config[name]["block_sizes"] = [block_sizes]
93
+ else:
94
+ images = process_images(media["image"], self.image_processor, self.config).half()
95
+ media[name] = [image for image in images]
96
+ elif name == "video":
97
+ media[name] = [
98
+ process_images(images, self.image_processor, self.config).half()
99
+ for images in media[name]
100
+ ]
101
+ else:
102
+ raise ValueError(f"Unsupported media type: {name}")
103
+
104
+ input_ids = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True).cuda().unsqueeze(0)
105
+ # Set up the generation config
106
+ # print(input_ids.shape); print(media); input()
107
+ return BatchFeature(data={"input_ids": input_ids, **media})
108
+
109
+ def batch_decode(self, *args, **kwargs):
110
+ """
111
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
112
+ refer to the docstring of this method for more information.
113
+ """
114
+ return self.tokenizer.batch_decode(*args, **kwargs)
115
+
116
+ def decode(self, *args, **kwargs):
117
+ """
118
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
119
+ the docstring of this method for more information.
120
+ """
121
+ return self.tokenizer.decode(*args, **kwargs)
122
+
123
+ def post_process_image_text_to_text(self, generated_outputs):
124
+ """
125
+ Post-process the output of the model to decode the text.
126
+
127
+ Args:
128
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
129
+ The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
130
+ or `(sequence_length,)`.
131
+
132
+ Returns:
133
+ `List[str]`: The decoded text.
134
+ """
135
+ return self.tokenizer.batch_decode(
136
+ generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
137
+ )
138
+
139
+ @property
140
+ def model_input_names(self):
141
+ tokenizer_input_names = self.tokenizer.model_input_names
142
+ image_processor_input_names = self.image_processor.model_input_names
143
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
144
+
145
+ # inputs = processor(conversation=llavaconv, padding=True, return_tensors="pt")
146
+ def apply_chat_template(self, conversation, add_generation_prompt=True, **kwargs):
147
+ vila_conv = []
148
+
149
+ for chat in conversation:
150
+ vila_chat = {
151
+ "from": "",
152
+ "value": []
153
+ }
154
+ if chat["role"] == "user":
155
+ # a user turn may contain both images and text
156
+ vila_chat["from"] = "human"
157
+ for content in chat["content"]:
158
+ if content["type"] == "image":
159
+ vila_chat["value"].append(Image(content["path"]))
160
+ elif content["type"] == "text":
161
+ vila_chat["value"].append(content["text"])
162
+ else:
163
+ raise ValueError(f"Unsupported content type: {content['type']}")
164
+ elif chat["role"] == "assistant":
165
+ vila_chat["from"] = "gpt"
166
+ for content in chat["content"]:
167
+ assert content["type"] == "text", f"Unsupported content type: {content['type']}"
168
+ vila_chat["value"].append(content["text"])
169
+ vila_conv.append(vila_chat)
170
+
171
+ return self(vila_conv)
172
+
173
+ if __name__ == "__main__":
174
+ # gpt style: user, assistant
175
+ # vila style: human, gpt
176
+ gpt_conv = [
177
+ {
178
+ "role": "user",
179
+ "content": [
180
+ {"type": "image", "path": "demo_images/demo_img_1.png"},
181
+ {"type": "text", "text": "Describe this image."}
182
+ ]
183
+ }
184
+ ]
185
+
186
+ llavaconv = [
187
+ {
188
+ "from": "human",
189
+ "value": [
190
+ PIL.Image.open("demo_images/demo_img_1.png"),
191
+ "Describe this image.",
192
+ ],
193
+ }
194
+ ]
195
+
196
+ processor = AutoProcessor.from_pretrained(output_dir, trust_remote_code=True)
197
+ inputs = processor.apply_chat_template(conversation=gpt_conv, padding=True, return_tensors="pt")
198
+ # model = llava.load("Efficient-Large-Model/qwen25_2B_3x3-sft").cuda()
199
+ # print(model)
200
+ model_path = "NVILA-Lite-2B-hf-preview"
201
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
202
+ # res = model.generate_content(["how are you today?"])
203
+ # print(model.config)
204
+ # print(model.tokenizer)
205
+ # print(res)
206
+ # exit(0)
207
+
208
+ processor = VILAProcessor(
209
+ config=model.config,
210
+ image_processor=model.vision_tower.image_processor,
211
+ tokenizer=model.tokenizer,
212
+ )
213
+
214
+ # TODO: add padding, return_tensors,
215
+ inputs = processor(conversation=llavaconv, padding=True, return_tensors="pt")
216
+ print(inputs.keys(), inputs.input_ids.shape, [_.shape for _ in inputs.image])
217
+ print("vila conv pass")
218
+
219
+ inputs = processor.apply_chat_template(conversation=gpt_conv, padding=True, return_tensors="pt")
220
+ print(inputs.keys(), inputs.input_ids.shape, [_.shape for _ in inputs.image])
221
+ print("gpt conv pass")
222
+
223
+ output_ids = model.generate(
224
+ input_ids=inputs.input_ids,
225
+ media={
226
+ "image": inputs.image,
227
+ },
228
+ media_config={
229
+ "image": {}
230
+ },
231
+ generation_config=model.generation_config,
232
+ max_new_tokens=100,
233
+ )
234
+ print(output_ids)
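
This processor is registered through the `AutoProcessor` entry added to `auto_map` in `config.json` below, so `AutoProcessor.from_pretrained(..., trust_remote_code=True)` resolves to `VILAProcessor`. Internally, `apply_chat_template` converts GPT-style conversations (`role`/`content`) into VILA-style ones (`from`/`value`) before tokenization. A minimal standalone sketch of that conversion (illustrative only: the real processor wraps image paths in its own `Image` media class rather than a PIL image, and the image path is a placeholder):

```python
# Condensed sketch of the role/content -> from/value conversion performed by
# VILAProcessor.apply_chat_template above. PIL is used here only for illustration.
import PIL.Image

def gpt_to_vila(conversation):
    role_map = {"user": "human", "assistant": "gpt"}
    vila_conv = []
    for turn in conversation:
        values = []
        for content in turn["content"]:
            if content["type"] == "image":
                values.append(PIL.Image.open(content["path"]))
            elif content["type"] == "text":
                values.append(content["text"])
            else:
                raise ValueError(f"Unsupported content type: {content['type']}")
        vila_conv.append({"from": role_map[turn["role"]], "value": values})
    return vila_conv

gpt_conv = [{
    "role": "user",
    "content": [
        {"type": "image", "path": "demo_images/demo_img_1.png"},  # placeholder path
        {"type": "text", "text": "Describe this image."},
    ],
}]
print(gpt_to_vila(gpt_conv))
```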
builder.py CHANGED
@@ -229,7 +229,6 @@ def build_llm_and_tokenizer(
229
  chat_template = fd.read()
230
  tokenizer.chat_template = chat_template.replace(" ", "").replace("\n", "")
231
 
232
- # NOTE(ligeng): disable temporarially, let see will any bugs introduce
233
  # Set stop tokens for the tokenizer
234
  tokenizer.stop_tokens = infer_stop_tokens(tokenizer)
235
  tokenizer.stop_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.stop_tokens)
 
229
  chat_template = fd.read()
230
  tokenizer.chat_template = chat_template.replace(" ", "").replace("\n", "")
231
 
 
232
  # Set stop tokens for the tokenizer
233
  tokenizer.stop_tokens = infer_stop_tokens(tokenizer)
234
  tokenizer.stop_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.stop_tokens)
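
The stop-token setup kept above records, on the tokenizer, which tokens should end generation. A hedged sketch of how such stop tokens are typically consumed downstream (the token strings shown are assumptions about what `infer_stop_tokens` might return for a Qwen2-style chat template, not values confirmed by this diff):

```python
# Hedged sketch: wiring tokenizer-recorded stop tokens into generation.
# The concrete stop tokens below are assumptions, not taken from this repo.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("NVILA-Lite-2B-hf-preview/llm")  # local checkout assumed
stop_tokens = ["<|im_end|>", "<|endoftext|>"]          # e.g. what infer_stop_tokens could return
stop_token_ids = tokenizer.convert_tokens_to_ids(stop_tokens)

# During generation, the ids can then act as end-of-sequence candidates:
# output_ids = model.generate(..., eos_token_id=stop_token_ids)
```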
config.json CHANGED
@@ -269,6 +269,7 @@
269
  },
270
  "version": "2.0",
271
  "auto_map": {
 
272
  "AutoConfig": "modeling_vila.VILAConfig",
273
  "AutoModel": "modeling_vila.VILAForCasualLM",
274
  "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM"
 
269
  },
270
  "version": "2.0",
271
  "auto_map": {
272
+ "AutoProcessor": "auto_processor.VILAProcessor",
273
  "AutoConfig": "modeling_vila.VILAConfig",
274
  "AutoModel": "modeling_vila.VILAForCasualLM",
275
  "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM"
llm/added_tokens.json CHANGED
@@ -1,4 +1,7 @@
1
  {
 
 
 
2
  "<|endoftext|>": 151643,
3
  "<|im_end|>": 151645,
4
  "<|im_start|>": 151644,
 
1
  {
2
+ "<image>": 151649,
3
+ "<vila/sentinel>": 151648,
4
+ "<vila/video>": 151650,
5
  "<|endoftext|>": 151643,
6
  "<|im_end|>": 151645,
7
  "<|im_start|>": 151644,
llm/tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fc37d325d718c91319f527fbe8258c03ac890aba2f252b85af89a625927908a
3
+ size 11419189
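
Together with `llm/added_tokens.json` above and the new `llm/tokenizer_config.json` entries below, the updated tokenizer registers three media tokens at fixed ids. A quick check, assuming a local checkout of this repository so the tokenizer can be loaded from the `llm/` subfolder:

```python
# Verify that the media tokens added in this commit resolve to the ids recorded
# in added_tokens.json above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("NVILA-Lite-2B-hf-preview/llm")
for token, expected_id in [("<vila/sentinel>", 151648), ("<image>", 151649), ("<vila/video>", 151650)]:
    assert tokenizer.convert_tokens_to_ids(token) == expected_id, token
print("media tokens resolve to their expected ids")
```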
llm/tokenizer_config.json CHANGED
@@ -40,6 +40,30 @@
40
  "rstrip": false,
41
  "single_word": false,
42
  "special": true
43
  }
44
  },
45
  "additional_special_tokens": [
 
40
  "rstrip": false,
41
  "single_word": false,
42
  "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<vila/sentinel>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<image>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<vila/video>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
  }
68
  },
69
  "additional_special_tokens": [
llm/vocab.json CHANGED
The diff for this file is too large to render. See raw diff
 
mm_utils.py CHANGED
@@ -26,7 +26,7 @@ import torch
26
  from PIL import Image
27
  from transformers import StoppingCriteria
28
 
29
- from .constants import DEFAULT_IMAGE_TOKEN
30
 
31
 
32
  def get_frame_from_vcap(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
 
26
  from PIL import Image
27
  from transformers import StoppingCriteria
28
 
29
+ from llava.constants import DEFAULT_IMAGE_TOKEN
30
 
31
 
32
  def get_frame_from_vcap(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
modeling_vila.py CHANGED
@@ -48,8 +48,8 @@ from .media_encoder import BasicImageEncoder, BasicVideoEncoder
48
  from .mm_utils import process_image, process_images
49
  from .siglip_encoder import SiglipVisionTower, SiglipVisionTowerDynamicS2, SiglipVisionTowerS2
50
  from .tokenizer_utils import tokenize_conversation
51
- from .utils import get_model_config
52
-
53
 
54
  # from llava.constants import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, NUM_EXTRA_TOKENS
55
  # quick hack for remote code
@@ -217,6 +217,7 @@ class VILAPretrainedModel(PreTrainedModel):
217
  output_dir: str = None,
218
  vila_version: str | None = None,
219
  conv_mode: str | None = None,
 
220
  *model_args,
221
  **kwargs,
222
  ):
@@ -225,8 +226,11 @@ class VILAPretrainedModel(PreTrainedModel):
225
 
226
  if os.path.isdir(model_path):
227
  model_path = model_path
228
- api = HfApi()
229
-
 
 
 
230
  if check_dot_in_model_path(model_path) and output_dir is None:
231
  raise ValueError(
232
  f"Model path {model_path} contains a dot, which will affect the remote code loading. Please specify the output directory without dot in the path to fix this issue."
@@ -238,15 +242,12 @@ class VILAPretrainedModel(PreTrainedModel):
238
  if vila_version is None:
239
  vila_version = get_vila_version(model_path)
240
 
241
- if api.repo_exists(model_path):
242
- model_path = snapshot_download(model_path, local_dir=output_dir)
243
- print("downloading HF model to", model_path)
244
-
245
  cfg_path = os.path.join(model_path, "config.json")
246
  config = json.load(open(cfg_path))
247
  config["version"] = "2.0" # nvila tag
248
  config["architectures"] = ["VILAForCasualLM"]
249
  config["auto_map"] = {
 
250
  "AutoConfig": "modeling_vila.VILAConfig",
251
  "AutoModel": "modeling_vila.VILAForCasualLM",
252
  "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM",
@@ -261,19 +262,44 @@ class VILAPretrainedModel(PreTrainedModel):
261
  with open(jinja_path, "w") as f:
262
  f.write(jinja_template)
263
  json.dump(config, open(cfg_path, "w"), indent=2)
264
- self.copy_remote_py_files(model_path)
265
 
266
  @classmethod
267
- def copy_remote_py_files(cls, output_dir):
268
  ## copy .py and REAMDE for next loading remote code
269
  current_file_path = os.path.abspath(__file__)
270
  current_folder = os.path.dirname(current_file_path)
271
  for file_name in os.listdir(current_folder):
272
  if file_name.endswith(".py") or file_name.endswith(".jinja"):
273
  full_file_name = os.path.join(current_folder, file_name)
274
  if os.path.isfile(full_file_name):
275
- shutil.copy(full_file_name, output_dir)
276
- print("[HF remote code] copying", full_file_name, "to", output_dir)
277
 
278
  def save_pretrained(self, output_dir, state_dict=None):
279
  if state_dict is None:
@@ -358,7 +384,6 @@ class VILAPretrainedModel(PreTrainedModel):
358
  # XGrammar tokenizer and grammar compiler
359
  # lazy init only when specified json output during inference
360
  self.grammar_compiler = None
361
-
362
  self.llm.resize_token_embeddings(len(self.tokenizer))
363
  return self.llm, self.tokenizer
364
 
@@ -1077,6 +1102,13 @@ class VILAForCasualLM(VILAPretrainedModel):
1077
  # Set up the generation config
1078
  generation_config = generation_config or self.default_generation_config
1079
 
1080
  # Generate the response
1081
  try:
1082
  output_ids = self.generate(
 
48
  from .mm_utils import process_image, process_images
49
  from .siglip_encoder import SiglipVisionTower, SiglipVisionTowerDynamicS2, SiglipVisionTowerS2
50
  from .tokenizer_utils import tokenize_conversation
51
+ from .utils import get_model_config, load_tokenizer_then_handle_media_tokens_and_chat_template
52
+ from .auto_processor import VILAProcessor
53
 
54
  # from llava.constants import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, NUM_EXTRA_TOKENS
55
  # quick hack for remote code
 
217
  output_dir: str = None,
218
  vila_version: str | None = None,
219
  conv_mode: str | None = None,
220
+ copy: bool = True,
221
  *model_args,
222
  **kwargs,
223
  ):
 
226
 
227
  if os.path.isdir(model_path):
228
  model_path = model_path
229
+ else:
230
+ api = HfApi()
231
+ model_path = snapshot_download(model_path, local_dir=output_dir)
232
+ print("downloading HF model to", model_path)
233
+
234
  if check_dot_in_model_path(model_path) and output_dir is None:
235
  raise ValueError(
236
  f"Model path {model_path} contains a dot, which will affect the remote code loading. Please specify the output directory without dot in the path to fix this issue."
 
242
  if vila_version is None:
243
  vila_version = get_vila_version(model_path)
244
 
 
 
 
 
245
  cfg_path = os.path.join(model_path, "config.json")
246
  config = json.load(open(cfg_path))
247
  config["version"] = "2.0" # nvila tag
248
  config["architectures"] = ["VILAForCasualLM"]
249
  config["auto_map"] = {
250
+ "AutoProcessor": "auto_processor.VILAProcessor",
251
  "AutoConfig": "modeling_vila.VILAConfig",
252
  "AutoModel": "modeling_vila.VILAForCasualLM",
253
  "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM",
 
262
  with open(jinja_path, "w") as f:
263
  f.write(jinja_template)
264
  json.dump(config, open(cfg_path, "w"), indent=2)
265
+ self.copy_remote_py_files(model_path, copy=copy)
266
+
267
+ ##########################################################################################
268
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
269
+ tokenizer = load_tokenizer_then_handle_media_tokens_and_chat_template(model_path, config)
270
+ tokenizer.save_pretrained(osp.join(output_dir, "llm"))
271
+ ##########################################################################################
272
 
273
  @classmethod
274
+ def copy_remote_py_files(cls, output_dir, copy=True):
275
  ## copy .py files and README for the next remote-code loading
276
  current_file_path = os.path.abspath(__file__)
277
  current_folder = os.path.dirname(current_file_path)
278
  for file_name in os.listdir(current_folder):
279
+ if file_name == "INSTRUCTIONS.md":
280
+ src_fname = os.path.join(current_folder, file_name)
281
+ dst_fname = os.path.join(output_dir, "README.md")
282
+ if os.path.exists(dst_fname):
283
+ old_readme = open(dst_fname, 'r').read()
284
+ else:
285
+ old_readme = ""
286
+ with open(src_fname, 'r') as src, open(dst_fname, 'w') as dst:
287
+ dst.write(src.read())
288
+ dst.write(old_readme)
289
+ print("[HF remote code] README", src_fname, "to", dst_fname)
290
  if file_name.endswith(".py") or file_name.endswith(".jinja"):
291
  full_file_name = os.path.join(current_folder, file_name)
292
  if os.path.isfile(full_file_name):
293
+ if copy:
294
+ shutil.copy(full_file_name, output_dir)
295
+ print("[HF remote code] copying", full_file_name, "to", output_dir)
296
+ else:
297
+ # symlink to ease development
298
+ if os.path.exists(os.path.join(output_dir, file_name)):
299
+ os.remove(os.path.join(output_dir, file_name))
300
+ os.symlink(full_file_name, os.path.join(output_dir, file_name))
301
+ print("[HF remote code] linking", full_file_name, "to", output_dir)
302
+
303
 
304
  def save_pretrained(self, output_dir, state_dict=None):
305
  if state_dict is None:
 
384
  # XGrammar tokenizer and grammar compiler
385
  # lazy init only when specified json output during inference
386
  self.grammar_compiler = None
 
387
  self.llm.resize_token_embeddings(len(self.tokenizer))
388
  return self.llm, self.tokenizer
389
 
 
1102
  # Set up the generation config
1103
  generation_config = generation_config or self.default_generation_config
1104
 
1105
+ # print("input_ids", input_ids.shape)
1106
+ # print(input_ids)
1107
+ # print(self.tokenizer.batch_decode(input_ids))
1108
+ # print("media", {k: len(v) for k, v in media.items()})
1109
+ # print("media_config", media_config)
1110
+ # print("generation_config", generation_config)
1111
+ # input("wait for debug")
1112
  # Generate the response
1113
  try:
1114
  output_ids = self.generate(
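
One behavioral change above worth calling out: `copy_remote_py_files` now takes a `copy` flag, copying the remote-code `.py`/`.jinja` files when `copy=True` and symlinking them when `copy=False` (handy during development), and `convert_vila_dev_ckpt_to_remote` forwards that flag. A hedged usage sketch mirroring the README's Model Convert example:

```python
# Hedged sketch of the converter entry point; paths follow the README example.
# copy=False symlinks modeling_vila.py, auto_processor.py, etc. into output_dir
# instead of copying them, so edits to the source tree are picked up directly.
import shutil, os.path as osp
from llava.remote_code.modeling_vila import VILAForCasualLM

model_path = "Efficient-Large-Model/NVILA-Lite-2B"
output_dir = "NVILA-Lite-2B-hf-preview"

if osp.isdir(output_dir):
    shutil.rmtree(output_dir)

VILAForCasualLM.convert_vila_dev_ckpt_to_remote(model_path, output_dir, copy=False)
```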
siglip_encoder.py CHANGED
@@ -19,16 +19,12 @@ import torch.nn as nn
19
  import torch.nn.functional as F
20
  from accelerate.hooks import add_hook_to_module
21
  from einops import rearrange
22
-
23
  from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, SiglipImageProcessor
24
  from transformers.image_processing_utils import BaseImageProcessor
 
25
  from transformers.models.siglip import SiglipVisionModel
26
 
27
- from s2wrapper import forward as multiscale_forward
28
-
29
- # from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
30
- def is_deepspeed_zero3_enabled():
31
- return False
32
 
33
  class VisionTower(nn.Module):
34
  def __init__(self, vision_tower, args, delay_load=False):
@@ -77,8 +73,10 @@ class VisionTower(nn.Module):
77
  import torch.nn as nn
78
 
79
  if is_deepspeed_zero3_enabled():
80
- import deepspeed
81
-
 
 
82
  with deepspeed.zero.GatheredParameters([old_embeddings.weight], modifier_rank=None):
83
  old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
84
  else:
 
19
  import torch.nn.functional as F
20
  from accelerate.hooks import add_hook_to_module
21
  from einops import rearrange
22
+ from s2wrapper import forward as multiscale_forward
23
  from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, SiglipImageProcessor
24
  from transformers.image_processing_utils import BaseImageProcessor
25
+ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
26
  from transformers.models.siglip import SiglipVisionModel
27
 
 
 
 
 
 
28
 
29
  class VisionTower(nn.Module):
30
  def __init__(self, vision_tower, args, delay_load=False):
 
73
  import torch.nn as nn
74
 
75
  if is_deepspeed_zero3_enabled():
76
+ try:
77
+ import deepspeed
78
+ except ImportError:
79
+ raise ImportError("DeepSpeed is not installed. Please install it with `pip install deepspeed`.")
80
  with deepspeed.zero.GatheredParameters([old_embeddings.weight], modifier_rank=None):
81
  old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
82
  else:
utils.py CHANGED
@@ -19,8 +19,40 @@ import os.path as osp
19
 
20
  from huggingface_hub import repo_exists, snapshot_download
21
  from huggingface_hub.utils import HFValidationError, validate_repo_id
22
- from transformers import AutoConfig, PretrainedConfig
23
-
24
 
25
  def get_model_config(config):
26
  default_keys = ["llm_cfg", "vision_tower_cfg", "mm_projector_cfg"]
 
19
 
20
  from huggingface_hub import repo_exists, snapshot_download
21
  from huggingface_hub.utils import HFValidationError, validate_repo_id
22
+ from transformers import AutoConfig, PretrainedConfig, AutoTokenizer
23
+
24
+ from .configuration_vila import VILAConfig
25
+ from .constants import MEDIA_TOKENS
26
+ from .tokenizer_utils import infer_stop_tokens
27
+
28
+ def load_tokenizer_then_handle_media_tokens_and_chat_template(model_name_or_path, config: VILAConfig, model_max_length=None):
29
+ # TODO(ligeng): a lot of copy-paste code, refactor to make a single function
30
+ tokenizer = AutoTokenizer.from_pretrained(osp.join(model_name_or_path, "llm"), padding_side="right", use_fast=True, legacy=False)
31
+ if model_max_length is not None:
32
+ tokenizer.model_max_length = model_max_length
33
+
34
+ # Load chat template if specified.
35
+ if getattr(config, "chat_template", None) is not None:
36
+ print(f"Using chat template: {config.chat_template}")
37
+ fpath = os.path.join(os.path.dirname(__file__), "chat_templates", f"{config.chat_template}.jinja")
38
+ if not os.path.exists(fpath):
39
+ fpath = os.path.join(os.path.dirname(model_name_or_path), f"{config.chat_template}.jinja")
40
+ with open(fpath) as fd:
41
+ chat_template = fd.read()
42
+ tokenizer.chat_template = chat_template.replace(" ", "").replace("\n", "")
43
+
44
+ # Set stop tokens for the tokenizer
45
+ tokenizer.stop_tokens = infer_stop_tokens(tokenizer)
46
+ tokenizer.stop_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.stop_tokens)
47
+
48
+ # Add media tokens to the tokenizer
49
+ tokenizer.media_tokens = MEDIA_TOKENS
50
+ tokenizer.media_token_ids = {}
51
+ for name, token in MEDIA_TOKENS.items():
52
+ tokenizer.add_tokens([token], special_tokens=True)
53
+ tokenizer.media_token_ids[name] = tokenizer.convert_tokens_to_ids(token)
54
+
55
+ return tokenizer
56
 
57
  def get_model_config(config):
58
  default_keys = ["llm_cfg", "vision_tower_cfg", "mm_projector_cfg"]
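
The new `load_tokenizer_then_handle_media_tokens_and_chat_template` helper loads the tokenizer from the checkpoint's `llm/` subfolder, applies the chat template named in the config, records stop tokens, and registers the media tokens. A hedged sketch that reproduces its media-token step with only public `transformers` APIs (token strings taken from `llm/added_tokens.json` above; a local checkout path is assumed):

```python
# Reproduce the media-token registration performed by the helper above.
import os.path as osp
from transformers import AutoTokenizer

MEDIA_TOKENS = {"image": "<image>", "video": "<vila/video>"}  # mirrors constants.MEDIA_TOKENS

model_dir = "NVILA-Lite-2B-hf-preview"                        # local checkout assumed
tokenizer = AutoTokenizer.from_pretrained(osp.join(model_dir, "llm"))

media_token_ids = {}
for name, token in MEDIA_TOKENS.items():
    tokenizer.add_tokens([token], special_tokens=True)        # no-op if already present
    media_token_ids[name] = tokenizer.convert_tokens_to_ids(token)

print(media_token_ids)  # expected: {"image": 151649, "video": 151650}
```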