Ligeng-Zhu committed on
Commit 06f6212 · verified · 1 Parent(s): cc1340e

Upload files with `vila-upload`.


Upload utils.py
Upload auto_processor.py
Upload siglip_encoder.py
Upload README.md
Upload mm_utils.py
Upload builder.py
Upload config.json
Upload modeling_vila.py
Upload llm/vocab.json
Upload llm/tokenizer_config.json
Upload llm/added_tokens.json
Upload llm/tokenizer.json

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ llm/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -2,8 +2,6 @@
2
  license: cc
3
  language:
4
  - en
5
- base_model:
6
- - Qwen/Qwen2.5-1.5B-Instruct
7
  ---
8
 
9
  Dependency setups:
@@ -13,6 +11,7 @@ pip install transformers==4.46 accelerate opencv-python torchvision einops
13
  pip install git+https://github.com/bfshi/scaling_on_scales.git
14
  ```
15
 
 
16
 
17
  ```python
18
  from transformers import AutoConfig, AutoModel
@@ -20,9 +19,13 @@ from termcolor import colored
20
 
21
  model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
22
 
23
- # config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
24
- # model = AutoModel.from_config(config, trust_remote_code=True)
 
 
25
  model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
 
 
26
  res = model.generate_content([
27
  "how are you today?"
28
  ])
@@ -30,10 +33,69 @@ print(colored(res, "cyan", attrs=["bold"]))
30
 
31
  print("---" * 40)
32
 
 
33
  import PIL.Image
34
  response = model.generate_content([
35
  PIL.Image.open("inference_test/test_data/caption_meat.jpeg"),
36
  "describe the image?"
37
  ])
38
  print(colored(response, "cyan", attrs=["bold"]))
39
  ```
 
2
  license: cc
3
  language:
4
  - en
 
 
5
  ---
6
 
7
  Dependency setups:
 
11
  pip install git+https://github.com/bfshi/scaling_on_scales.git
12
  ```
13
 
14
+ ## Usage
15
 
16
  ```python
17
  from transformers import AutoConfig, AutoModel
 
19
 
20
  model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
21
 
22
+ # you can build the model from its config
23
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
24
+ model = AutoModel.from_config(config, trust_remote_code=True)
25
+ # or load it directly with from_pretrained
26
  model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
27
+
28
+ # example: generate with raw text
29
  res = model.generate_content([
30
  "how are you today?"
31
  ])
 
33
 
34
  print("---" * 40)
35
 
36
+ # example: generate with text + image
37
  import PIL.Image
38
  response = model.generate_content([
39
  PIL.Image.open("inference_test/test_data/caption_meat.jpeg"),
40
  "describe the image?"
41
  ])
42
  print(colored(response, "cyan", attrs=["bold"]))
43
+ ```
44
+
45
+ ## AutoProcessor
46
+
47
+ We also support the `AutoProcessor` class if you want to fine-tune the model.
48
+
49
+ ```python
50
+ from transformers import AutoProcessor, AutoModel
51
+
52
+ model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
53
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
54
+
55
+ gpt_conv = [ {
56
+ "role": "user",
57
+ "content": [
58
+ {"type": "image", "path": "demo_images/demo_img_1.png"},
59
+ {"type": "text", "text": "Describe this image."}
60
+ ]
61
+ }]
62
+
63
+ inputs = processor.apply_chat_template(conversation=gpt_conv, padding=True, return_tensors="pt")
64
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
65
+ output_ids = model.generate(
66
+ input_ids=inputs.input_ids,
67
+ media={
68
+ "image": inputs.image,
69
+ },
70
+ media_config={
71
+ "image": {}
72
+ },
73
+ generation_config=model.generation_config,
74
+ max_new_tokens=256,
75
+ )
76
+ print(processor.tokenizer.decode(output_ids[0], skip_special_tokens=True))
77
+
78
+ ##### the above code is equivalent to
79
+ # response = model.generate_content([
80
+ # PIL.Image.open("demo_images/demo_img_1.png"),
81
+ # "describe the image?"
82
+ # ])
83
+ # print(colored(response, "cyan", attrs=["bold"]))
84
+ ```
85
+
86
+ ## Model Convert
87
+
88
+ The following code converts a conventional NVILA model to an HF-compatible model.
89
+
90
+ ```python
91
+ import os, os.path as osp, shutil
92
+ from transformers import AutoConfig, AutoModel, AutoProcessor, AutoTokenizer, AutoImageProcessor
93
+
94
+ model_path = "Efficient-Large-Model/NVILA-Lite-2B"
95
+ output_dir = "NVILA-Lite-2B-hf-preview"
96
+
97
+ if osp.isdir(output_dir):
98
+ shutil.rmtree(output_dir)
99
+ from llava.remote_code.modeling_vila import VILAForCasualLM
100
+ VILAForCasualLM.convert_vila_dev_ckpt_to_remote(model_path, output_dir, copy=False)
101
  ```
auto_processor.py ADDED
@@ -0,0 +1,234 @@
1
+ import os, os.path as osp
2
+ from collections import defaultdict
3
+ from typing import List, Union
4
+
5
+ from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoImageProcessor, AutoProcessor
6
+ from transformers.feature_extraction_utils import BatchFeature
7
+ from transformers.image_utils import ImageInput, VideoInput
8
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
9
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
10
+ from transformers.utils import logging
11
+
12
+ from .constants import DEFAULT_IMAGE_TOKEN, MEDIA_TOKENS
13
+ from .media import Image, Video
14
+ from .mm_utils import process_image, process_images
15
+ from .media import extract_media
16
+ from .tokenizer_utils import tokenize_conversation
17
+
18
+
19
+ class VILAProcessorKwargs(ProcessingKwargs, total=False):
20
+ _defaults = {
21
+ "text_kwargs": {
22
+ "padding": False,
23
+ },
24
+ }
25
+
26
+
27
+ class VILAProcessor(ProcessorMixin):
28
+ # attributes = ["image_processor", "tokenizer"]
29
+ attributes = []
30
+ # valid_kwargs = ["chat_template"]
31
+ valid_kwargs = []
32
+ # image_processor_class = "VILAImageProcessor"
33
+ # tokenizer_class = ("VILATokenizer", "VILATokenizerFast")
34
+
35
+ def __init__(self, image_processor=None, tokenizer=None, chat_template=None, config=None, **kwargs):
36
+ # self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
37
+ # self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
38
+ self.image_token = MEDIA_TOKENS["image"]
39
+ self.video_token = MEDIA_TOKENS["video"]
40
+ self.config = config
41
+ self.image_processor = image_processor
42
+ self.tokenizer = tokenizer
43
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
44
+
45
+ @classmethod
46
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
47
+ if os.path.isdir(pretrained_model_name_or_path):
48
+ pretrained_model_name_or_path = pretrained_model_name_or_path
49
+ else:
50
+ print(f"pretrained_model_name_or_path {pretrained_model_name_or_path} is not a directory, downloading")
51
+ from huggingface_hub import HfApi, snapshot_download
52
+ pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
53
+
54
+ image_processor = AutoImageProcessor.from_pretrained(osp.join(pretrained_model_name_or_path, "vision_tower"), trust_remote_code=True)
55
+ tokenizer = AutoTokenizer.from_pretrained(osp.join(pretrained_model_name_or_path, "llm"), trust_remote_code=True)
56
+ config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
57
+
58
+ return cls(image_processor=image_processor, tokenizer=tokenizer, config=config)
59
+
60
+ def __repr__(self):
61
+ return f"VILAProcessor(image_processor={self.image_processor}, tokenizer={self.tokenizer}, config={self.config})"
62
+
63
+ def __call__(
64
+ self,
65
+ conversation,
66
+ images: ImageInput = None,
67
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
68
+ videos: VideoInput = None,
69
+ **kwargs: Unpack[VILAProcessorKwargs],
70
+ ) -> BatchFeature:
71
+ # TODO: should be merged with llava_arch.py/generate_content()
72
+ # TODO (extract and preprocess should be done together, as the preprocess of image and video can be different, i.e. when dynamic res is used)
73
+ media = extract_media(conversation, self.config)
74
+ # Process media
75
+ media_config = defaultdict(dict)
76
+ for name in media:
77
+ if name == "image":
78
+ if len(media["image"]) == 1 and self.config.image_aspect_ratio in ["dynamic", "dynamic_s2"]:
79
+ self.config.image_processor = self.image_processor
80
+ if self.config.image_aspect_ratio == "dynamic":
81
+ images = process_image(media["image"][0], self.config, None, enable_dynamic_res=True).half()
82
+ conversation[0]["value"] = conversation[0]["value"].replace(
83
+ DEFAULT_IMAGE_TOKEN, f"{DEFAULT_IMAGE_TOKEN}\n" * images.shape[0]
84
+ )
85
+ else:
86
+ if type(self.config.s2_scales) is str:
87
+ self.config.s2_scales = list(map(int, self.config.s2_scales.split(",")))
88
+ images, block_sizes = process_image(
89
+ media["image"][0], self.config, None, enable_dynamic_s2=True
90
+ )
91
+ images = images.half()
92
+ media_config[name]["block_sizes"] = [block_sizes]
93
+ else:
94
+ images = process_images(media["image"], self.image_processor, self.config).half()
95
+ media[name] = [image for image in images]
96
+ elif name == "video":
97
+ media[name] = [
98
+ process_images(images, self.image_processor, self.config).half()
99
+ for images in media[name]
100
+ ]
101
+ else:
102
+ raise ValueError(f"Unsupported media type: {name}")
103
+
104
+ input_ids = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True).cuda().unsqueeze(0)
105
+ # Set up the generation config
106
+ # print(input_ids.shape); print(media); input()
107
+ return BatchFeature(data={"input_ids": input_ids, **media})
108
+
109
+ def batch_decode(self, *args, **kwargs):
110
+ """
111
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
112
+ refer to the docstring of this method for more information.
113
+ """
114
+ return self.tokenizer.batch_decode(*args, **kwargs)
115
+
116
+ def decode(self, *args, **kwargs):
117
+ """
118
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
119
+ the docstring of this method for more information.
120
+ """
121
+ return self.tokenizer.decode(*args, **kwargs)
122
+
123
+ def post_process_image_text_to_text(self, generated_outputs):
124
+ """
125
+ Post-process the output of the model to decode the text.
126
+
127
+ Args:
128
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
129
+ The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
130
+ or `(sequence_length,)`.
131
+
132
+ Returns:
133
+ `List[str]`: The decoded text.
134
+ """
135
+ return self.tokenizer.batch_decode(
136
+ generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
137
+ )
138
+
139
+ @property
140
+ def model_input_names(self):
141
+ tokenizer_input_names = self.tokenizer.model_input_names
142
+ image_processor_input_names = self.image_processor.model_input_names
143
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
144
+
145
+ # inputs = processor(conversation=llavaconv, padding=True, return_tensors="pt")
146
+ def apply_chat_template(self, conversation, add_generation_prompt=True, **kwargs):
147
+ vila_conv = []
148
+
149
+ for chat in conversation:
150
+ vila_chat = {
151
+ "from": "",
152
+ "value": []
153
+ }
154
+ if chat["role"] == "user":
155
+ # a user turn may contain both images and text
156
+ vila_chat["from"] = "human"
157
+ for content in chat["content"]:
158
+ if content["type"] == "image":
159
+ vila_chat["value"].append(Image(content["path"]))
160
+ elif content["type"] == "text":
161
+ vila_chat["value"].append(content["text"])
162
+ else:
163
+ raise ValueError(f"Unsupported content type: {content['type']}")
164
+ elif chat["role"] == "assistant":
165
+ vila_chat["from"] = "gpt"
166
+ for content in chat["content"]:
167
+ assert content["type"] == "text", f"Unsupported content type: {content['type']}"
168
+ vila_chat["value"].append(content["text"])
169
+ vila_conv.append(vila_chat)
170
+
171
+ return self(vila_conv)
172
+
173
+ if __name__ == "__main__":
174
+ # gpt style: user, assistant
175
+ # vila style: human, gpt
176
+ gpt_conv = [
177
+ {
178
+ "role": "user",
179
+ "content": [
180
+ {"type": "image", "path": "demo_images/demo_img_1.png"},
181
+ {"type": "text", "text": "Describe this image."}
182
+ ]
183
+ }
184
+ ]
185
+
186
+ llavaconv = [
187
+ {
188
+ "from": "human",
189
+ "value": [
190
+ PIL.Image.open("demo_images/demo_img_1.png"),
191
+ "Describe this image.",
192
+ ],
193
+ }
194
+ ]
195
+
196
+ processor = AutoProcessor.from_pretrained(output_dir, trust_remote_code=True)
197
+ inputs = processor.apply_chat_template(conversation=gpt_conv, padding=True, return_tensors="pt")
198
+ # model = llava.load("Efficient-Large-Model/qwen25_2B_3x3-sft").cuda()
199
+ # print(model)
200
+ model_path = "NVILA-Lite-2B-hf-preview"
201
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
202
+ # res = model.generate_content(["how are you today?"])
203
+ # print(model.config)
204
+ # print(model.tokenizer)
205
+ # print(res)
206
+ # exit(0)
207
+
208
+ processor = VILAProcessor(
209
+ config=model.config,
210
+ image_processor=model.vision_tower.image_processor,
211
+ tokenizer=model.tokenizer,
212
+ )
213
+
214
+ # TODO: add padding, return_tensors,
215
+ inputs = processor(conversation=llavaconv, padding=True, return_tensors="pt")
216
+ print(inputs.keys(), inputs.input_ids.shape, [_.shape for _ in inputs.image])
217
+ print("vila conv pass")
218
+
219
+ inputs = processor.apply_chat_template(conversation=gpt_conv, padding=True, return_tensors="pt")
220
+ print(inputs.keys(), inputs.input_ids.shape, [_.shape for _ in inputs.image])
221
+ print("gpt conv pass")
222
+
223
+ output_ids = model.generate(
224
+ input_ids=inputs.input_ids,
225
+ media={
226
+ "image": inputs.image,
227
+ },
228
+ media_config={
229
+ "image": {}
230
+ },
231
+ generation_config=model.generation_config,
232
+ max_new_tokens=100,
233
+ )
234
+ print(output_ids)
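
This processor is registered through the `AutoProcessor` entry added to `auto_map` in `config.json` below, so `AutoProcessor.from_pretrained(..., trust_remote_code=True)` resolves to `VILAProcessor`. Internally, `apply_chat_template` converts GPT-style conversations (`role`/`content`) into VILA-style ones (`from`/`value`) before tokenization. A minimal standalone sketch of that conversion (illustrative only: the real processor wraps image paths in its own `Image` media class rather than a PIL image, and the image path is a placeholder):

```python
# Condensed sketch of the role/content -> from/value conversion performed by
# VILAProcessor.apply_chat_template above. PIL is used here only for illustration.
import PIL.Image

def gpt_to_vila(conversation):
    role_map = {"user": "human", "assistant": "gpt"}
    vila_conv = []
    for turn in conversation:
        values = []
        for content in turn["content"]:
            if content["type"] == "image":
                values.append(PIL.Image.open(content["path"]))
            elif content["type"] == "text":
                values.append(content["text"])
            else:
                raise ValueError(f"Unsupported content type: {content['type']}")
        vila_conv.append({"from": role_map[turn["role"]], "value": values})
    return vila_conv

gpt_conv = [{
    "role": "user",
    "content": [
        {"type": "image", "path": "demo_images/demo_img_1.png"},  # placeholder path
        {"type": "text", "text": "Describe this image."},
    ],
}]
print(gpt_to_vila(gpt_conv))
```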
builder.py CHANGED
@@ -229,7 +229,6 @@ def build_llm_and_tokenizer(
229
  chat_template = fd.read()
230
  tokenizer.chat_template = chat_template.replace(" ", "").replace("\n", "")
231
 
232
- # NOTE(ligeng): disable temporarially, let see will any bugs introduce
233
  # Set stop tokens for the tokenizer
234
  tokenizer.stop_tokens = infer_stop_tokens(tokenizer)
235
  tokenizer.stop_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.stop_tokens)
 
229
  chat_template = fd.read()
230
  tokenizer.chat_template = chat_template.replace(" ", "").replace("\n", "")
231
 
 
232
  # Set stop tokens for the tokenizer
233
  tokenizer.stop_tokens = infer_stop_tokens(tokenizer)
234
  tokenizer.stop_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.stop_tokens)
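
The stop-token setup kept above records, on the tokenizer, which tokens should end generation. A hedged sketch of how such stop tokens are typically consumed downstream (the token strings shown are assumptions about what `infer_stop_tokens` might return for a Qwen2-style chat template, not values confirmed by this diff):

```python
# Hedged sketch: wiring tokenizer-recorded stop tokens into generation.
# The concrete stop tokens below are assumptions, not taken from this repo.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("NVILA-Lite-2B-hf-preview/llm")  # local checkout assumed
stop_tokens = ["<|im_end|>", "<|endoftext|>"]          # e.g. what infer_stop_tokens could return
stop_token_ids = tokenizer.convert_tokens_to_ids(stop_tokens)

# During generation, the ids can then act as end-of-sequence candidates:
# output_ids = model.generate(..., eos_token_id=stop_token_ids)
```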
config.json CHANGED
@@ -269,6 +269,7 @@
269
  },
270
  "version": "2.0",
271
  "auto_map": {
 
272
  "AutoConfig": "modeling_vila.VILAConfig",
273
  "AutoModel": "modeling_vila.VILAForCasualLM",
274
  "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM"
 
269
  },
270
  "version": "2.0",
271
  "auto_map": {
272
+ "AutoProcessor": "auto_processor.VILAProcessor",
273
  "AutoConfig": "modeling_vila.VILAConfig",
274
  "AutoModel": "modeling_vila.VILAForCasualLM",
275
  "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM"
llm/added_tokens.json CHANGED
@@ -1,4 +1,7 @@
1
  {
 
 
 
2
  "<|endoftext|>": 151643,
3
  "<|im_end|>": 151645,
4
  "<|im_start|>": 151644,
 
1
  {
2
+ "<image>": 151649,
3
+ "<vila/sentinel>": 151648,
4
+ "<vila/video>": 151650,
5
  "<|endoftext|>": 151643,
6
  "<|im_end|>": 151645,
7
  "<|im_start|>": 151644,
llm/tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fc37d325d718c91319f527fbe8258c03ac890aba2f252b85af89a625927908a
3
+ size 11419189
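
Together with `llm/added_tokens.json` above and the new `llm/tokenizer_config.json` entries below, the updated tokenizer registers three media tokens at fixed ids. A quick check, assuming a local checkout of this repository so the tokenizer can be loaded from the `llm/` subfolder:

```python
# Verify that the media tokens added in this commit resolve to the ids recorded
# in added_tokens.json above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("NVILA-Lite-2B-hf-preview/llm")
for token, expected_id in [("<vila/sentinel>", 151648), ("<image>", 151649), ("<vila/video>", 151650)]:
    assert tokenizer.convert_tokens_to_ids(token) == expected_id, token
print("media tokens resolve to their expected ids")
```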
llm/tokenizer_config.json CHANGED
@@ -40,6 +40,30 @@
40
  "rstrip": false,
41
  "single_word": false,
42
  "special": true
43
  }
44
  },
45
  "additional_special_tokens": [
 
40
  "rstrip": false,
41
  "single_word": false,
42
  "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<vila/sentinel>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<image>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<vila/video>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
  }
68
  },
69
  "additional_special_tokens": [
llm/vocab.json CHANGED
The diff for this file is too large to render. See raw diff
 
mm_utils.py CHANGED
@@ -26,7 +26,7 @@ import torch
26
  from PIL import Image
27
  from transformers import StoppingCriteria
28
 
29
- from .constants import DEFAULT_IMAGE_TOKEN
30
 
31
 
32
  def get_frame_from_vcap(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
 
26
  from PIL import Image
27
  from transformers import StoppingCriteria
28
 
29
+ from llava.constants import DEFAULT_IMAGE_TOKEN
30
 
31
 
32
  def get_frame_from_vcap(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
modeling_vila.py CHANGED
@@ -48,8 +48,8 @@ from .media_encoder import BasicImageEncoder, BasicVideoEncoder
48
  from .mm_utils import process_image, process_images
49
  from .siglip_encoder import SiglipVisionTower, SiglipVisionTowerDynamicS2, SiglipVisionTowerS2
50
  from .tokenizer_utils import tokenize_conversation
51
- from .utils import get_model_config
52
-
53
 
54
  # from llava.constants import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, NUM_EXTRA_TOKENS
55
  # quick hack for remote code
@@ -217,6 +217,7 @@ class VILAPretrainedModel(PreTrainedModel):
217
  output_dir: str = None,
218
  vila_version: str | None = None,
219
  conv_mode: str | None = None,
 
220
  *model_args,
221
  **kwargs,
222
  ):
@@ -225,8 +226,11 @@ class VILAPretrainedModel(PreTrainedModel):
225
 
226
  if os.path.isdir(model_path):
227
  model_path = model_path
228
- api = HfApi()
229
-
 
 
 
230
  if check_dot_in_model_path(model_path) and output_dir is None:
231
  raise ValueError(
232
  f"Model path {model_path} contains a dot, which will affect the remote code loading. Please specify the output directory without dot in the path to fix this issue."
@@ -238,15 +242,12 @@ class VILAPretrainedModel(PreTrainedModel):
238
  if vila_version is None:
239
  vila_version = get_vila_version(model_path)
240
 
241
- if api.repo_exists(model_path):
242
- model_path = snapshot_download(model_path, local_dir=output_dir)
243
- print("downloading HF model to", model_path)
244
-
245
  cfg_path = os.path.join(model_path, "config.json")
246
  config = json.load(open(cfg_path))
247
  config["version"] = "2.0" # nvila tag
248
  config["architectures"] = ["VILAForCasualLM"]
249
  config["auto_map"] = {
 
250
  "AutoConfig": "modeling_vila.VILAConfig",
251
  "AutoModel": "modeling_vila.VILAForCasualLM",
252
  "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM",
@@ -261,19 +262,44 @@ class VILAPretrainedModel(PreTrainedModel):
261
  with open(jinja_path, "w") as f:
262
  f.write(jinja_template)
263
  json.dump(config, open(cfg_path, "w"), indent=2)
264
- self.copy_remote_py_files(model_path)
265
 
266
  @classmethod
267
- def copy_remote_py_files(cls, output_dir):
268
  ## copy .py and REAMDE for next loading remote code
269
  current_file_path = os.path.abspath(__file__)
270
  current_folder = os.path.dirname(current_file_path)
271
  for file_name in os.listdir(current_folder):
272
  if file_name.endswith(".py") or file_name.endswith(".jinja"):
273
  full_file_name = os.path.join(current_folder, file_name)
274
  if os.path.isfile(full_file_name):
275
- shutil.copy(full_file_name, output_dir)
276
- print("[HF remote code] copying", full_file_name, "to", output_dir)
277
 
278
  def save_pretrained(self, output_dir, state_dict=None):
279
  if state_dict is None:
@@ -358,7 +384,6 @@ class VILAPretrainedModel(PreTrainedModel):
358
  # XGrammar tokenizer and grammar compiler
359
  # lazy init only when specified json output during inference
360
  self.grammar_compiler = None
361
-
362
  self.llm.resize_token_embeddings(len(self.tokenizer))
363
  return self.llm, self.tokenizer
364
 
@@ -1077,6 +1102,13 @@ class VILAForCasualLM(VILAPretrainedModel):
1077
  # Set up the generation config
1078
  generation_config = generation_config or self.default_generation_config
1079
 
1080
  # Generate the response
1081
  try:
1082
  output_ids = self.generate(
 
48
  from .mm_utils import process_image, process_images
49
  from .siglip_encoder import SiglipVisionTower, SiglipVisionTowerDynamicS2, SiglipVisionTowerS2
50
  from .tokenizer_utils import tokenize_conversation
51
+ from .utils import get_model_config, load_tokenizer_then_handle_media_tokens_and_chat_template
52
+ from .auto_processor import VILAProcessor
53
 
54
  # from llava.constants import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, NUM_EXTRA_TOKENS
55
  # quick hack for remote code
 
217
  output_dir: str = None,
218
  vila_version: str | None = None,
219
  conv_mode: str | None = None,
220
+ copy: bool = True,
221
  *model_args,
222
  **kwargs,
223
  ):
 
226
 
227
  if os.path.isdir(model_path):
228
  model_path = model_path
229
+ else:
230
+ api = HfApi()
231
+ model_path = snapshot_download(model_path, local_dir=output_dir)
232
+ print("downloading HF model to", model_path)
233
+
234
  if check_dot_in_model_path(model_path) and output_dir is None:
235
  raise ValueError(
236
  f"Model path {model_path} contains a dot, which will affect the remote code loading. Please specify the output directory without dot in the path to fix this issue."
 
242
  if vila_version is None:
243
  vila_version = get_vila_version(model_path)
244
 
 
 
 
 
245
  cfg_path = os.path.join(model_path, "config.json")
246
  config = json.load(open(cfg_path))
247
  config["version"] = "2.0" # nvila tag
248
  config["architectures"] = ["VILAForCasualLM"]
249
  config["auto_map"] = {
250
+ "AutoProcessor": "auto_processor.VILAProcessor",
251
  "AutoConfig": "modeling_vila.VILAConfig",
252
  "AutoModel": "modeling_vila.VILAForCasualLM",
253
  "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM",
 
262
  with open(jinja_path, "w") as f:
263
  f.write(jinja_template)
264
  json.dump(config, open(cfg_path, "w"), indent=2)
265
+ self.copy_remote_py_files(model_path, copy=copy)
266
+
267
+ ##########################################################################################
268
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
269
+ tokenizer = load_tokenizer_then_handle_media_tokens_and_chat_template(model_path, config)
270
+ tokenizer.save_pretrained(osp.join(output_dir, "llm"))
271
+ ##########################################################################################
272
 
273
  @classmethod
274
+ def copy_remote_py_files(cls, output_dir, copy=True):
275
  ## copy .py files and README for the next remote-code loading
276
  current_file_path = os.path.abspath(__file__)
277
  current_folder = os.path.dirname(current_file_path)
278
  for file_name in os.listdir(current_folder):
279
+ if file_name == "INSTRUCTIONS.md":
280
+ src_fname = os.path.join(current_folder, file_name)
281
+ dst_fname = os.path.join(output_dir, "README.md")
282
+ if os.path.exists(dst_fname):
283
+ old_readme = open(dst_fname, 'r').read()
284
+ else:
285
+ old_readme = ""
286
+ with open(src_fname, 'r') as src, open(dst_fname, 'w') as dst:
287
+ dst.write(src.read())
288
+ dst.write(old_readme)
289
+ print("[HF remote code] README", src_fname, "to", dst_fname)
290
  if file_name.endswith(".py") or file_name.endswith(".jinja"):
291
  full_file_name = os.path.join(current_folder, file_name)
292
  if os.path.isfile(full_file_name):
293
+ if copy:
294
+ shutil.copy(full_file_name, output_dir)
295
+ print("[HF remote code] copying", full_file_name, "to", output_dir)
296
+ else:
297
+ # symlink to ease development
298
+ if os.path.exists(os.path.join(output_dir, file_name)):
299
+ os.remove(os.path.join(output_dir, file_name))
300
+ os.symlink(full_file_name, os.path.join(output_dir, file_name))
301
+ print("[HF remote code] linking", full_file_name, "to", output_dir)
302
+
303
 
304
  def save_pretrained(self, output_dir, state_dict=None):
305
  if state_dict is None:
 
384
  # XGrammar tokenizer and grammar compiler
385
  # lazy init only when specified json output during inference
386
  self.grammar_compiler = None
 
387
  self.llm.resize_token_embeddings(len(self.tokenizer))
388
  return self.llm, self.tokenizer
389
 
 
1102
  # Set up the generation config
1103
  generation_config = generation_config or self.default_generation_config
1104
 
1105
+ # print("input_ids", input_ids.shape)
1106
+ # print(input_ids)
1107
+ # print(self.tokenizer.batch_decode(input_ids))
1108
+ # print("media", {k: len(v) for k, v in media.items()})
1109
+ # print("media_config", media_config)
1110
+ # print("generation_config", generation_config)
1111
+ # input("wait for debug")
1112
  # Generate the response
1113
  try:
1114
  output_ids = self.generate(
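
One behavioral change above worth calling out: `copy_remote_py_files` now takes a `copy` flag, copying the remote-code `.py`/`.jinja` files when `copy=True` and symlinking them when `copy=False` (handy during development), and `convert_vila_dev_ckpt_to_remote` forwards that flag. A hedged usage sketch mirroring the README's Model Convert example:

```python
# Hedged sketch of the converter entry point; paths follow the README example.
# copy=False symlinks modeling_vila.py, auto_processor.py, etc. into output_dir
# instead of copying them, so edits to the source tree are picked up directly.
import shutil, os.path as osp
from llava.remote_code.modeling_vila import VILAForCasualLM

model_path = "Efficient-Large-Model/NVILA-Lite-2B"
output_dir = "NVILA-Lite-2B-hf-preview"

if osp.isdir(output_dir):
    shutil.rmtree(output_dir)

VILAForCasualLM.convert_vila_dev_ckpt_to_remote(model_path, output_dir, copy=False)
```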
siglip_encoder.py CHANGED
@@ -19,16 +19,12 @@ import torch.nn as nn
19
  import torch.nn.functional as F
20
  from accelerate.hooks import add_hook_to_module
21
  from einops import rearrange
22
-
23
  from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, SiglipImageProcessor
24
  from transformers.image_processing_utils import BaseImageProcessor
 
25
  from transformers.models.siglip import SiglipVisionModel
26
 
27
- from s2wrapper import forward as multiscale_forward
28
-
29
- # from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
30
- def is_deepspeed_zero3_enabled():
31
- return False
32
 
33
  class VisionTower(nn.Module):
34
  def __init__(self, vision_tower, args, delay_load=False):
@@ -77,8 +73,10 @@ class VisionTower(nn.Module):
77
  import torch.nn as nn
78
 
79
  if is_deepspeed_zero3_enabled():
80
- import deepspeed
81
-
 
 
82
  with deepspeed.zero.GatheredParameters([old_embeddings.weight], modifier_rank=None):
83
  old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
84
  else:
 
19
  import torch.nn.functional as F
20
  from accelerate.hooks import add_hook_to_module
21
  from einops import rearrange
22
+ from s2wrapper import forward as multiscale_forward
23
  from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, SiglipImageProcessor
24
  from transformers.image_processing_utils import BaseImageProcessor
25
+ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
26
  from transformers.models.siglip import SiglipVisionModel
27
 
 
 
 
 
 
28
 
29
  class VisionTower(nn.Module):
30
  def __init__(self, vision_tower, args, delay_load=False):
 
73
  import torch.nn as nn
74
 
75
  if is_deepspeed_zero3_enabled():
76
+ try:
77
+ import deepspeed
78
+ except ImportError:
79
+ raise ImportError("DeepSpeed is not installed. Please install it with `pip install deepspeed`.")
80
  with deepspeed.zero.GatheredParameters([old_embeddings.weight], modifier_rank=None):
81
  old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
82
  else:
utils.py CHANGED
@@ -19,8 +19,40 @@ import os.path as osp
19
 
20
  from huggingface_hub import repo_exists, snapshot_download
21
  from huggingface_hub.utils import HFValidationError, validate_repo_id
22
- from transformers import AutoConfig, PretrainedConfig
23
-
24
 
25
  def get_model_config(config):
26
  default_keys = ["llm_cfg", "vision_tower_cfg", "mm_projector_cfg"]
 
19
 
20
  from huggingface_hub import repo_exists, snapshot_download
21
  from huggingface_hub.utils import HFValidationError, validate_repo_id
22
+ from transformers import AutoConfig, PretrainedConfig, AutoTokenizer
23
+
24
+ from .configuration_vila import VILAConfig
25
+ from .constants import MEDIA_TOKENS
26
+ from .tokenizer_utils import infer_stop_tokens
27
+
28
+ def load_tokenizer_then_handle_media_tokens_and_chat_template(model_name_or_path, config: VILAConfig, model_max_length=None):
29
+ # TODO(ligeng): a lot of copy-paste code, refactor to make a single function
30
+ tokenizer = AutoTokenizer.from_pretrained(osp.join(model_name_or_path, "llm"), padding_side="right", use_fast=True, legacy=False)
31
+ if model_max_length is not None:
32
+ tokenizer.model_max_length = model_max_length
33
+
34
+ # Load chat template if specified.
35
+ if getattr(config, "chat_template", None) is not None:
36
+ print(f"Using chat template: {config.chat_template}")
37
+ fpath = os.path.join(os.path.dirname(__file__), "chat_templates", f"{config.chat_template}.jinja")
38
+ if not os.path.exists(fpath):
39
+ fpath = os.path.join(os.path.dirname(model_name_or_path), f"{config.chat_template}.jinja")
40
+ with open(fpath) as fd:
41
+ chat_template = fd.read()
42
+ tokenizer.chat_template = chat_template.replace(" ", "").replace("\n", "")
43
+
44
+ # Set stop tokens for the tokenizer
45
+ tokenizer.stop_tokens = infer_stop_tokens(tokenizer)
46
+ tokenizer.stop_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.stop_tokens)
47
+
48
+ # Add media tokens to the tokenizer
49
+ tokenizer.media_tokens = MEDIA_TOKENS
50
+ tokenizer.media_token_ids = {}
51
+ for name, token in MEDIA_TOKENS.items():
52
+ tokenizer.add_tokens([token], special_tokens=True)
53
+ tokenizer.media_token_ids[name] = tokenizer.convert_tokens_to_ids(token)
54
+
55
+ return tokenizer
56
 
57
  def get_model_config(config):
58
  default_keys = ["llm_cfg", "vision_tower_cfg", "mm_projector_cfg"]
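
The new `load_tokenizer_then_handle_media_tokens_and_chat_template` helper loads the tokenizer from the checkpoint's `llm/` subfolder, applies the chat template named in the config, records stop tokens, and registers the media tokens. A hedged sketch that reproduces its media-token step with only public `transformers` APIs (token strings taken from `llm/added_tokens.json` above; a local checkout path is assumed):

```python
# Reproduce the media-token registration performed by the helper above.
import os.path as osp
from transformers import AutoTokenizer

MEDIA_TOKENS = {"image": "<image>", "video": "<vila/video>"}  # mirrors constants.MEDIA_TOKENS

model_dir = "NVILA-Lite-2B-hf-preview"                        # local checkout assumed
tokenizer = AutoTokenizer.from_pretrained(osp.join(model_dir, "llm"))

media_token_ids = {}
for name, token in MEDIA_TOKENS.items():
    tokenizer.add_tokens([token], special_tokens=True)        # no-op if already present
    media_token_ids[name] = tokenizer.convert_tokens_to_ids(token)

print(media_token_ids)  # expected: {"image": 151649, "video": 151650}
```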