---
license: cc-by-nc-4.0
language:
- en
tags:
- vila
- nvila
- conversational
- multimodal
---
## Dependency setup
```bash
# Other transformers versions may also work, but we have not tested them.
pip install transformers==4.46 accelerate opencv-python torchvision einops pillow termcolor
pip install git+https://github.com/bfshi/scaling_on_scales.git
```
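After installing, you can quickly confirm the environment roughly matches the tested setup. This is a minimal sanity-check sketch, not part of the model's API; the exact version strings on your system may differ.
```python
# Hedged sketch: verify the installed dependencies.
import transformers
import torch

print(transformers.__version__)   # tested with 4.46.x
print(torch.cuda.is_available())  # True if a CUDA-capable GPU is visible
```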
## Usage
```python
from transformers import AutoConfig, AutoModel
from termcolor import colored
model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
# You can build the model from a config...
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_config(config, trust_remote_code=True)
# ...or load the pretrained weights directly with from_pretrained.
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")

# Example: generate from plain text.
res = model.generate_content([
    "how are you today?"
])
print(colored(res, "cyan", attrs=["bold"]))
print("---" * 40)

# Example: generate from text + image.
import PIL.Image

response = model.generate_content([
    PIL.Image.open("inference_test/test_data/caption_meat.jpeg"),
    "describe the image?"
])
print(colored(response, "cyan", attrs=["bold"]))
```
## AutoProcessor
We also provide an `AutoProcessor` class to ease data preparation for training and fine-tuning.
```python
from transformers import AutoProcessor, AutoModel
model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
gpt_conv = [{
    "role": "user",
    "content": [
        {"type": "image", "path": "demo_images/demo_img_1.png"},
        {"type": "text", "text": "Describe this image."},
    ],
}]
inputs = processor.apply_chat_template(conversation=gpt_conv, padding=True, return_tensors="pt")

model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
output_ids = model.generate(
    input_ids=inputs.input_ids,
    media={
        "image": inputs.image,
    },
    media_config={
        "image": {},
    },
    generation_config=model.generation_config,
    max_new_tokens=256,
)
print(processor.tokenizer.decode(output_ids[0], skip_special_tokens=True))
# The above is equivalent to:
# response = model.generate_content([
#     PIL.Image.open("demo_images/demo_img_1.png"),
#     "describe the image?"
# ])
# print(colored(response, "cyan", attrs=["bold"]))
```
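The `content` list in each turn can, in principle, hold several items. The sketch below builds a two-image turn using the same schema shown above; the second image path is a placeholder, and multi-image support in this preview checkpoint is an assumption we have not verified here.
```python
# Hedged sketch: one user turn containing two images (support is an assumption,
# not something this model card confirms; demo_img_2.png is a placeholder path).
multi_image_conv = [{
    "role": "user",
    "content": [
        {"type": "image", "path": "demo_images/demo_img_1.png"},
        {"type": "image", "path": "demo_images/demo_img_2.png"},
        {"type": "text", "text": "What differs between these two images?"},
    ],
}]
inputs = processor.apply_chat_template(conversation=multi_image_conv, padding=True, return_tensors="pt")
```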
## Model Conversion
The following code converts a conventional NVILA model into an HF-compatible model.
```python
import os, os.path as osp
import shutil

from transformers import AutoConfig, AutoModel, AutoProcessor, AutoTokenizer, AutoImageProcessor

model_path = "Efficient-Large-Model/NVILA-Lite-2B"
output_dir = "NVILA-Lite-2B-hf-preview"

# Remove any stale output directory before converting.
if osp.isdir(output_dir):
    shutil.rmtree(output_dir)

from llava.remote_code.modeling_vila import VILAForCasualLM

VILAForCasualLM.convert_vila_dev_ckpt_to_remote(model_path, output_dir, copy=False)
```
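Once the conversion finishes, the exported folder can be loaded back with the same `AutoModel` API used in the Usage section. This is a minimal sketch for sanity-checking the export; it assumes the conversion wrote a loadable checkpoint into `output_dir`.
```python
# Hedged sketch: reload the converted checkpoint and run a quick generation.
from transformers import AutoModel

converted = AutoModel.from_pretrained(output_dir, trust_remote_code=True, device_map="auto")
print(converted.generate_content(["how are you today?"]))
```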