update model weights to bf16

- config.json +2 -2
- generation_config.json +2 -2
- model-00001-of-00004.safetensors → model-00001-of-00002.safetensors +2 -2
- model-00002-of-00004.safetensors → model-00002-of-00002.safetensors +2 -2
- model-00003-of-00004.safetensors +0 -3
- model-00004-of-00004.safetensors +0 -3
- model.safetensors.index.json +0 -0
- modeling_xgenmm.py +16 -6
- setup.sh +1 -1
- special_tokens_map.json +1 -7
- tokenizer_config.json +2 -1
config.json
CHANGED
@@ -13,8 +13,8 @@
     "sliding_window": 2047,
     "torch_dtype": "bfloat16"
   },
-  "torch_dtype": "
-  "transformers_version": "4.
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.2",
   "vision_encoder_config": {
     "anyres_patch_sampling": true,
     "image_aspect_ratio": "anyres",
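With "torch_dtype": "bfloat16" now set at the top level, the checkpoint loads in bf16 by default. A minimal sketch of verifying this (the repo id below is a placeholder for this repository, and AutoModelForVision2Seq with trust_remote_code is assumed to be the intended entry point):

import torch
from transformers import AutoModelForVision2Seq

# Placeholder repo id; substitute the repository this commit belongs to.
model = AutoModelForVision2Seq.from_pretrained(
    "Salesforce/xgen-mm-phi3-mini-instruct-r-v1",
    trust_remote_code=True,       # required for the custom modeling_xgenmm.py
    torch_dtype=torch.bfloat16,   # matches the new top-level "torch_dtype"
)
print(model.dtype)  # expected: torch.bfloat16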
generation_config.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "_from_model_config": true,
   "bos_token_id": 1,
-  "eos_token_id":
+  "eos_token_id": 32007,
   "pad_token_id": 32000,
-  "transformers_version": "4.
+  "transformers_version": "4.44.2"
 }
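The new eos_token_id, 32007, should line up with the "<|end|>" token configured as eos_token in tokenizer_config.json below. A quick check, assuming the Phi-3-style vocabulary (placeholder repo id):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Salesforce/xgen-mm-phi3-mini-instruct-r-v1")  # placeholder id
print(tok.convert_tokens_to_ids("<|end|>"))  # expected: 32007, matching eos_token_id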
model-00001-of-00004.safetensors → model-00001-of-00002.safetensors
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:21e0452442b5c189d7f0a1078d243a4ad705036e12703f25f81f0711ae478d70
+size 4972926984
model-00002-of-00004.safetensors → model-00002-of-00002.safetensors
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:9a5e8bd3fbe75d20605d6760268337170a544f04bc4dfac00c2cba65981d7deb
+size 3745680670
model-00003-of-00004.safetensors
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cf6be012e8534481a9ca4e8dd8f3e482de42af52cb86b23c463b1e55ab5a40a2
-size 4983112168
model-00004-of-00004.safetensors
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:71e9607ebf4884b3913619693b547dd4ae9bb66d9e6de42e80bcb5357de36914
-size 2508236156
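Deleting two shards and renaming the other two consolidates the checkpoint from four shards to two. As a back-of-the-envelope check (an illustration, not part of the commit), the new total size is consistent with a roughly 4.4B-parameter model stored at 2 bytes per parameter in bf16:

total_bytes = 4_972_926_984 + 3_745_680_670  # sizes from the two new LFS pointers
params = total_bytes / 2                     # bfloat16 = 2 bytes per parameter
print(f"{total_bytes / 1e9:.2f} GB ~ {params / 1e9:.2f}B params")
# -> 8.72 GB ~ 4.36B params, roughly half the footprint of an fp32 checkpoint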
model.safetensors.index.json
CHANGED
The diff for this file is too large to render. See raw diff.
modeling_xgenmm.py
CHANGED
@@ -162,6 +162,7 @@ class XGenMMConfig(PretrainedConfig):
                 "task_specific_params": None,
                 "problem_type": None,
                 "model_type": "phi3",
+                "_attn_implementation": "flash_attention_2",
             }
             logger.info(
                 "text_config is None. Initializing the text config with default values (`Phi3Config`)."
@@ -1031,7 +1032,7 @@ class VLM(nn.Module):
                 num_additional_embeddings=len(self.special_tokens),
                 _weight=self.lang_model.get_input_embeddings().weight,
                 pad_token_id=self.pad_token_id,
-            )
+            ).to(self.lang_model.dtype)
             if hasattr(input_embeds, "additional_embedding"):
                 input_embeds.additional_embedding.weight.data.normal_(
                     mean=0.0,
@@ -1052,7 +1053,7 @@ class VLM(nn.Module):
                     if hasattr(self.lang_model.get_output_embeddings(), "bias")
                     else None
                 ),
-            )
+            ).to(self.lang_model.dtype)
             if hasattr(out_embeds, "additional_fc"):
                 out_embeds.additional_fc.weight.data.normal_(
                     mean=0.0,
@@ -1642,7 +1643,8 @@ class VLMWithLanguageStream(VLM):
             if has_labels:
                 new_label = labels[i].clone()

-            for img_num
+            for img_num in range(len(image_token_idxs)):
+                img_idx = image_token_idxs[img_num]
                 # Get vision token attention mask for padded llava-style any resolution image tokens.
                 if self.image_aspect_ratio == "anyres":
                     num_vis_tokens = vision_tokens[i][img_num].shape[0]
@@ -1662,6 +1664,10 @@ class VLMWithLanguageStream(VLM):
                 vis_attention_mask = torch.ones(
                     num_vis_tokens, dtype=torch.long
                 ).to(attention_mask.device)
+
+                # Offset the rest of image tokens with current num_vis_tokens
+                for j in range(img_num+1, len(image_token_idxs)):
+                    image_token_idxs[j] += (num_vis_tokens - 1)

                 new_embed = torch.cat(
                     (
@@ -2029,11 +2035,15 @@ class XGenMMModelForConditionalGeneration(PreTrainedModel):

        # vision encoder initialization
        vision_encoder = AutoModel.from_pretrained(
-            config.vision_encoder_config.model_name
+            config.vision_encoder_config.model_name,
+            torch_dtype=config.text_config.torch_dtype,
        ).vision_model

        # language model initialization
-        language_model = AutoModelForCausalLM.from_config(
+        language_model = AutoModelForCausalLM.from_config(
+            config.text_config,
+            torch_dtype=config.text_config.torch_dtype,
+        )
        check_embedding_fns(language_model)
        # Update _tied_weights_keys using the base model used.
        if language_model._tied_weights_keys is not None:
@@ -2052,7 +2062,7 @@ class XGenMMModelForConditionalGeneration(PreTrainedModel):
                f"Warning: The language embedding dimension in the vision tokenizer config is different from the language model's embedding dimension. Overwriting the language embedding dimension in the vision tokenizer config to {overwrite}."
            )

-        vision_tokenizer = XGenMMVisionTokenizer(config.vision_tokenizer_config).model
+        vision_tokenizer = XGenMMVisionTokenizer(config.vision_tokenizer_config).model.to(language_model.dtype)

        self.vlm = XGenMMPerceiver(
            vision_encoder=vision_encoder,
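The trickiest hunk above is the index-offset fix in VLMWithLanguageStream: once an <image> placeholder is replaced by num_vis_tokens vision embeddings, every later placeholder index shifts by num_vis_tokens - 1. A standalone toy sketch of that bookkeeping (hypothetical names, not the repo's API):

def expand_placeholders(token_ids, image_token_id, vis_lengths):
    # positions of the <image> placeholders before any expansion
    image_token_idxs = [i for i, t in enumerate(token_ids) if t == image_token_id]
    out = list(token_ids)
    for img_num in range(len(image_token_idxs)):
        img_idx = image_token_idxs[img_num]
        num_vis_tokens = vis_lengths[img_num]
        # splice in num_vis_tokens vision slots (-1 stands in for a vision embedding)
        out[img_idx:img_idx + 1] = [-1] * num_vis_tokens
        # offset the remaining placeholder positions, as in the diff
        for j in range(img_num + 1, len(image_token_idxs)):
            image_token_idxs[j] += num_vis_tokens - 1
    return out

print(expand_placeholders([7, 99, 8, 99, 9], image_token_id=99, vis_lengths=[3, 2]))
# -> [7, -1, -1, -1, 8, -1, -1, 9]

Without the offset loop, the second placeholder would be looked up at its stale index and the splice would land inside the first image's tokens.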
setup.sh
CHANGED
@@ -2,6 +2,6 @@ pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https
 pip install open_clip_torch==2.24.0
 pip install einops
 pip install einops-exts
-pip install transformers==4.
+pip install transformers==4.44.2
 # optional
 pip install ipywidgets
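Pinning transformers==4.44.2 keeps the environment in sync with the "transformers_version" now recorded in config.json and generation_config.json. A one-line sanity check:

import transformers
assert transformers.__version__ == "4.44.2", transformers.__version__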
special_tokens_map.json
CHANGED
@@ -6,13 +6,7 @@
     "rstrip": false,
     "single_word": false
   },
-  "eos_token":
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
+  "eos_token": "<|end|>",
   "pad_token": {
     "content": "<pad>",
     "lstrip": false,
tokenizer_config.json
CHANGED
@@ -1,6 +1,7 @@
 {
   "add_bos_token": false,
   "add_eos_token": false,
+  "add_prefix_space": null,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
@@ -126,7 +127,7 @@
   "bos_token": "<s>",
   "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
   "clean_up_tokenization_spaces": false,
-  "eos_token": "<|
+  "eos_token": "<|end|>",
   "legacy": false,
   "model_max_length": 4096,
   "pad_token": "<pad>",
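With eos_token now "<|end|>", generation stops at the turn terminator that the chat template emits after every message. A minimal rendering sketch (placeholder repo id):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Salesforce/xgen-mm-phi3-mini-instruct-r-v1")  # placeholder id
messages = [{"role": "user", "content": "Describe the image."}]
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
# expected output:
# <|user|>
# Describe the image.<|end|>
# <|assistant|>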