DongfuJiang committed
Commit ed5e31e
Parent(s): 4935da6

Update README.md

README.md CHANGED

### Run example inference:
```python
import requests
import torch
from PIL import Image
from io import BytesIO

from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image


processor = AutoProcessor.from_pretrained("TIGER-Lab/Mantis-8B-Idefics2")  # do_image_splitting is False by default
model = AutoModelForVision2Seq.from_pretrained(
    "TIGER-Lab/Mantis-8B-Idefics2",
    device_map="auto"
)
generation_kwargs = {
    "max_new_tokens": 1024,
    "num_beams": 1,
    "do_sample": False
}

# Note that passing the image URLs (instead of the actual PIL images) to the processor is also possible
image1 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
image2 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
images = [image1, image2, image3]


query1 = "What cities image 1, image 2, and image 3 belong to respectively? Answer me in order."
query2 = "Which one do you recommend for a visit? and why?"
query3 = "Which picture has most cars in it?"

### Chat
### Round 1
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "image"},
            {"type": "image"},
            {"type": "text", "text": query1},
        ]
    }
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=images, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Generate
generated_ids = model.generate(**inputs, **generation_kwargs)
response = processor.batch_decode(generated_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print("User: ", query1)
print("ASSISTANT: ", response[0])

### Round 2
messages.append(
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": response[0]},
        ]
    }
)
messages.append(
    {
        "role": "user",
        "content": [
            {"type": "text", "text": query2},
        ]
    }
)
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=images, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}
generated_ids = model.generate(**inputs, **generation_kwargs)
response = processor.batch_decode(generated_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print("User: ", query2)
print("ASSISTANT: ", response[0])

### Round 3
messages.append(
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": response[0]},
        ]
    }
)
messages.append(
    {
        "role": "user",
        "content": [
            {"type": "text", "text": query3},
        ]
    }
)

prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=images, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}
generated_ids = model.generate(**inputs, **generation_kwargs)
response = processor.batch_decode(generated_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print("User: ", query3)
print("ASSISTANT: ", response[0])


"""
User: What cities image 1, image 2, and image 3 belong to respectively? Answer me in order.
ASSISTANT: Chicago, New York, San Francisco
User: Which one do you recommend for a visit? and why?
ASSISTANT: New York - because it's a bustling metropolis with iconic landmarks like the Statue of Liberty and the Empire State Building.
User: Which picture has most cars in it?
ASSISTANT: Image 3
"""
```
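Rounds 2 and 3 above repeat the same encode, generate, decode, and append steps. For longer chats, that boilerplate can be factored into a small helper. The sketch below is built only from the exact calls in the example; `chat_round` is our name, not part of the Mantis or `transformers` API, and it only handles text-only follow-up turns after the image-bearing first round:

```python
# Hypothetical helper refactoring the per-round steps from the example above.
# Not part of transformers or Mantis; covers text-only follow-up turns only.
def chat_round(model, processor, messages, query, images, generation_kwargs):
    # Append the new user turn, then re-encode the full chat history.
    messages.append({"role": "user", "content": [{"type": "text", "text": query}]})
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=images, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    generated_ids = model.generate(**inputs, **generation_kwargs)
    response = processor.batch_decode(
        generated_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )[0]
    # Keep the assistant reply in the history so the next round sees it.
    messages.append({"role": "assistant", "content": [{"type": "text", "text": response}]})
    return response

# Usage, picking up after Round 1 in the example:
# print("ASSISTANT: ", chat_round(model, processor, messages, query2, images, generation_kwargs))
```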

### Training