UncleFish committed

Commit 854508f
1 Parent(s): 2bd5aaf

update inference code to support transformers==4.41.1
README.md CHANGED
@@ -52,7 +52,7 @@ More technical details will come with a technical report soon.
 
 # How to use
 
-> We require the use of the development version (`"4.41.0.dev0"`) of the `transformers` library. To get it, as of 05/07/2024, one can use `pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers.`
+~~> We require the use of the development version (`"4.41.0.dev0"`) of the `transformers` library. To get it, as of 05/07/2024, one can use `pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers.`~~
 
 ```python
 from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor, StoppingCriteria
@@ -149,4 +149,10 @@ pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121
 pip install open_clip_torch==2.24.0
 pip install einops
 pip install einops-exts
-```
+pip install transformers==4.41.1
+```
+
+# Changelog
+
+* 05/24/2024
+  * update codebase to be compatible with `transformers==4.41.1`.
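For reference, here is the full usage pattern that the updated README and `demo.ipynb` in this commit converge on; a minimal sketch assuming `transformers==4.41.1` is installed and using the checkpoint id that appears in the notebook diff below:

```python
# Minimal loading sketch matching this commit's updated demo.ipynb.
# trust_remote_code=True is required so the Auto* classes can pull in the renamed
# configuration_xgenmm.py / modeling_xgenmm.py modules shipped with the checkpoint.
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor

model_name_or_path = "Salesforce/xgen-mm-phi3-mini-instruct-r-v1"
model = AutoModelForVision2Seq.from_pretrained(model_name_or_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True, use_fast=False, legacy=False)
image_processor = AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
tokenizer = model.update_special_tokens(tokenizer)
tokenizer.padding_side = "left"  # generation in this repo pads on the left (see demo.ipynb)
```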
config.json CHANGED
@@ -1,12 +1,12 @@
 {
   "architectures": [
-    "Blip3ModelForConditionalGeneration"
+    "XGenMMModelForConditionalGeneration"
   ],
   "auto_map": {
-    "AutoConfig": "configuration_blip_3.Blip3Config",
-    "AutoModelForVision2Seq": "modeling_blip_3.Blip3ModelForConditionalGeneration"
+    "AutoConfig": "configuration_xgenmm.XGenMMConfig",
+    "AutoModelForVision2Seq": "modeling_xgenmm.XGenMMModelForConditionalGeneration"
   },
-  "model_type": "blip_3",
+  "model_type": "xgenmm",
   "text_config": {
     "initial_tokenizer_len": 32012,
     "model_type": "phi3",
@@ -14,13 +14,13 @@
     "torch_dtype": "bfloat16"
   },
   "torch_dtype": "float32",
-  "transformers_version": "4.41.0.dev0",
+  "transformers_version": "4.41.1",
   "vision_encoder_config": {
     "anyres_patch_sampling": true,
     "image_aspect_ratio": "anyres",
-    "model_type": "blip_3_vision_encoder"
+    "model_type": "xgenmm_vision_encoder"
   },
   "vision_tokenizer_config": {
-    "model_type": "blip_3_vision_tokenizer"
+    "model_type": "xgenmm_vision_tokenizer"
   }
 }
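The renamed `auto_map` entries above are what route the `Auto*` loaders to the custom code files in this repo. A hedged sketch of how that mapping is exercised at load time (the printed values follow the fields written in this diff):

```python
# With trust_remote_code=True, AutoConfig follows config.json's auto_map entry
# "configuration_xgenmm.XGenMMConfig" and instantiates the custom config class,
# so the nested model_type fields match the values added in this commit.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Salesforce/xgen-mm-phi3-mini-instruct-r-v1", trust_remote_code=True)
print(config.model_type)                          # xgenmm
print(config.vision_encoder_config.model_type)    # xgenmm_vision_encoder
print(config.vision_tokenizer_config.model_type)  # xgenmm_vision_tokenizer
print(config.text_config.model_type)              # phi3
```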
configuration_blip_3.py → configuration_xgenmm.py RENAMED
@@ -4,8 +4,8 @@ from transformers import CONFIG_MAPPING
 
 logger = logging.get_logger(__name__)
 
-class Blip3VisionEncoderConfig(PretrainedConfig):
-    model_type = "blip_3_vision_encoder"
+class XGenMMVisionEncoderConfig(PretrainedConfig):
+    model_type = "xgenmm_vision_encoder"
 
     def __init__(self,
                  model_name: str = 'ViT-H-14-378-quickgelu',
@@ -16,8 +16,8 @@ class Blip3VisionEncoderConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
 
-class Blip3VisionTokenizerConfig(PretrainedConfig):
-    model_type = "blip_3_vision_tokenizer"
+class XGenMMVisionTokenizerConfig(PretrainedConfig):
+    model_type = "xgenmm_vision_tokenizer"
 
     def __init__(self,
                  vis_feature_dim: int = 1280,
@@ -34,8 +34,8 @@ class Blip3VisionTokenizerConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
 
-class Blip3Config(PretrainedConfig):
-    model_type = "blip_3"
+class XGenMMConfig(PretrainedConfig):
+    model_type = "xgenmm"
 
     def __init__(self,
                  vision_encoder_config: dict = None,
@@ -45,11 +45,11 @@ class Blip3Config(PretrainedConfig):
 
         if vision_encoder_config is None:
             vision_encoder_config = {'image_aspect_ratio': 'anyres', 'anyres_patch_sampling': True}
-            logger.info("vision_encoder_config is None. initializing the Blip3VisionEncoderConfig with default values.")
+            logger.info("vision_encoder_config is None. initializing the XGenMMVisionEncoderConfig with default values.")
 
         if vision_tokenizer_config is None:
             vision_tokenizer_config = {}
-            logger.info("vision_tokenizer_config is None. Initializing the Blip3VisionTokenizerConfig with default values.")
+            logger.info("vision_tokenizer_config is None. Initializing the XGenMMVisionTokenizerConfig with default values.")
 
         if text_config is None:
             text_config = {
@@ -131,9 +131,9 @@
             }
             logger.info("text_config is None. Initializing the text config with default values (`Phi3Config`).")
 
-        self.vision_encoder_config = Blip3VisionEncoderConfig(**vision_encoder_config)
+        self.vision_encoder_config = XGenMMVisionEncoderConfig(**vision_encoder_config)
 
-        self.vision_tokenizer_config = Blip3VisionTokenizerConfig(**vision_tokenizer_config)
+        self.vision_tokenizer_config = XGenMMVisionTokenizerConfig(**vision_tokenizer_config)
 
         text_model_type = text_config["model_type"] if "model_type" in text_config else "phi3"
         self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
@@ -147,8 +147,8 @@
     @classmethod
     def from_vision_encoder_vision_tokenizer_text_configs(
         cls,
-        vision_encoder_config: Blip3VisionEncoderConfig,
-        vision_tokenizer_config: Blip3VisionTokenizerConfig,
+        vision_encoder_config: XGenMMVisionEncoderConfig,
+        vision_tokenizer_config: XGenMMVisionTokenizerConfig,
         text_config: PretrainedConfig,
         **kwargs):
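The hunks above only rename the config classes; the default-construction logic is unchanged. A small sketch of that path, assuming the file is imported directly as a local module:

```python
# Constructing XGenMMConfig with no arguments exercises the "is None" branches shown
# above: default vision encoder/tokenizer configs are built and a default Phi3 text
# config is pulled from transformers' CONFIG_MAPPING.
from configuration_xgenmm import (
    XGenMMConfig,
    XGenMMVisionEncoderConfig,
    XGenMMVisionTokenizerConfig,
)

config = XGenMMConfig()
assert isinstance(config.vision_encoder_config, XGenMMVisionEncoderConfig)
assert isinstance(config.vision_tokenizer_config, XGenMMVisionTokenizerConfig)
assert config.text_config.model_type == "phi3"
```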
 
demo.ipynb CHANGED
@@ -2,21 +2,44 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0585fe10e4854d99857d74e836379a47",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
+     ]
+    }
+   ],
    "source": [
     "from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor, StoppingCriteria\n",
     "import torch\n",
-    "model = AutoModelForVision2Seq.from_pretrained(\"Salesforce/blip3-phi3-mini-instruct-r-v1\", trust_remote_code=True)\n",
-    "tokenizer = AutoTokenizer.from_pretrained(\"Salesforce/blip3-phi3-mini-instruct-r-v1\", trust_remote_code=True, use_fast=False, legacy=False)\n",
-    "image_processor = AutoImageProcessor.from_pretrained(\"Salesforce/blip3-phi3-mini-instruct-r-v1\", trust_remote_code=True)\n",
+    "model_name_or_path = \"Salesforce/xgen-mm-phi3-mini-instruct-r-v1\"\n",
+    "model = AutoModelForVision2Seq.from_pretrained(model_name_or_path, trust_remote_code=True)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True, use_fast=False, legacy=False)\n",
+    "image_processor = AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)\n",
     "tokenizer = model.update_special_tokens(tokenizer)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -46,17 +69,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
     "model = model.to('cuda')\n",
-    "model.eval()"
+    "model.eval()\n",
+    "tokenizer.padding_side = \"left\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -73,6 +97,13 @@
     },
     "output_type": "display_data"
    },
+   {
+    "name": "stderr",
+    "output_type": "stream",
+    "text": [
+     "You are not running the flash-attention implementation, expect numerical differences.\n"
+    ]
+   },
    {
     "name": "stdout",
     "output_type": "stream",
@@ -223,7 +254,6 @@
    }
   ],
   "source": [
-   "tokenizer.padding_side = \"left\"\n",
    "for sample in data:\n",
    "    img = PIL.Image.open(sample['image_path'])\n",
    "    display.display(Image(filename=sample['image_path'], width=300))\n",
@@ -262,7 +292,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.9.19"
+  "version": "3.10.14"
  }
 },
 "nbformat": 4,
generation_config.json CHANGED
@@ -3,5 +3,5 @@
   "bos_token_id": 1,
   "eos_token_id": 32000,
   "pad_token_id": 32000,
-  "transformers_version": "4.41.0.dev0"
+  "transformers_version": "4.41.1"
 }
modeling_blip_3.py → modeling_xgenmm.py RENAMED
@@ -4,13 +4,13 @@ import open_clip
 from typing import List, Optional, Tuple, Union
 from .utils import check_embedding_fns
 from .vlm import InstructPerceiverResampler, KosmosInstruct
-from .configuration_blip_3 import Blip3VisionEncoderConfig, Blip3VisionTokenizerConfig, Blip3Config
+from .configuration_xgenmm import XGenMMVisionEncoderConfig, XGenMMVisionTokenizerConfig, XGenMMConfig
 
-class Blip3VisionEncoder(PreTrainedModel):
+class XGenMMVisionEncoder(PreTrainedModel):
     main_input_name = "pixel_values"
-    config_class = Blip3VisionEncoderConfig
+    config_class = XGenMMVisionEncoderConfig
 
-    def __init__(self, config: Blip3VisionEncoderConfig):
+    def __init__(self, config: XGenMMVisionEncoderConfig):
         super().__init__(config)
         if config.model_name != 'ViT-H-14-378-quickgelu':
             raise ValueError(f"Unsupported model {config.model_name}. New vision models will be added soon.")
@@ -25,9 +25,9 @@ class Blip3VisionEncoder(PreTrainedModel):
 
 
 # vision tokenizer
-class Blip3VisionTokenizer(PreTrainedModel):
-    config_class = Blip3VisionTokenizerConfig
-    def __init__(self, config: Blip3VisionTokenizerConfig):
+class XGenMMVisionTokenizer(PreTrainedModel):
+    config_class = XGenMMVisionTokenizerConfig
+    def __init__(self, config: XGenMMVisionTokenizerConfig):
         super().__init__(config)
         self.model = InstructPerceiverResampler(
             dim_llm=config.lang_embedding_dim,
@@ -42,15 +42,15 @@ class Blip3VisionTokenizer(PreTrainedModel):
                 vision_attn_masks: torch.Tensor):
         return self.model(vision_features, vision_attn_masks)
 
-# Blip3 model
-class Blip3ModelForConditionalGeneration(PreTrainedModel):
-    config_class = Blip3Config
+# XGenMM model
+class XGenMMModelForConditionalGeneration(PreTrainedModel):
+    config_class = XGenMMConfig
 
-    def __init__(self, config: Blip3Config):
+    def __init__(self, config: XGenMMConfig):
         super().__init__(config)
 
         # vision encoder initialization
-        vision_encoder = Blip3VisionEncoder(config.vision_encoder_config).model
+        vision_encoder = XGenMMVisionEncoder(config.vision_encoder_config).model
         vision_encoder.visual.output_tokens = True
         vision_encoder = vision_encoder.visual
 
@@ -67,7 +67,7 @@ class Blip3ModelForConditionalGeneration(PreTrainedModel):
             config.vision_tokenizer_config.lang_embedding_dim = overwrite
             print(f"Warning: The language embedding dimension in the vision tokenizer config is different from the language model's embedding dimension. Overwriting the language embedding dimension in the vision tokenizer config to {overwrite}.")
 
-        vision_tokenizer = Blip3VisionTokenizer(config.vision_tokenizer_config).model
+        vision_tokenizer = XGenMMVisionTokenizer(config.vision_tokenizer_config).model
 
         self.vlm = KosmosInstruct(
             vision_encoder=vision_encoder,
setup.sh ADDED
@@ -0,0 +1,7 @@
+pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121
+pip install open_clip_torch==2.24.0
+pip install einops
+pip install einops-exts
+pip install transformers==4.41.1
+# optional
+pip install ipywidgets
utils.py CHANGED
@@ -2,7 +2,7 @@ import torch
 import ast
 import math
 from PIL import Image
-
+from packaging.version import Version
 
 def has_fn(model, fn_name):
     """Check if model has a function fn_name"""
vlm.py CHANGED
@@ -10,6 +10,7 @@ from transformers.modeling_outputs import CausalLMOutputWithPast
 from dataclasses import dataclass
 from transformers import CLIPVisionModel
 import transformers
+from packaging.version import Version
 
 from .utils import num_params, getattr_recursive, stack_with_padding, get_anyres_image_grid_shape, unpad_image
 
@@ -1512,7 +1513,7 @@ class KosmosInstruct(VLMWithLanguageStream):
             padding_side="left",
             num_beams=num_beams,
         )
-        if transformers.__version__ == '4.41.0.dev0':
+        if Version(transformers.__version__) >= Version('4.41.1'):
             output = self.lang_model.generate(
                 **new_inputs,
                 num_beams=num_beams,
@@ -1520,12 +1521,7 @@ class KosmosInstruct(VLMWithLanguageStream):
                 **kwargs,
             )
         else:
-            output = self.lang_model.generate(
-                **new_inputs,
-                past_key_values=past_key_values,
-                num_beams=num_beams,
-                use_cache=True,
-                **kwargs,
-            )
+            raise ValueError("Please upgrade transformers to version 4.41.1 or higher.")
+
         self._post_forward_hook()
         return output
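The guard above replaces an exact string match against the old dev build with a real version comparison. As a standalone illustration of the pattern (the helper name is ours, not part of the repo):

```python
# Version-gating sketch mirroring the change to vlm.py: packaging.version.Version
# compares release numbers semantically, so 4.41.1 and anything newer pass, while
# older installs fail fast with the same error message the repo now raises.
import transformers
from packaging.version import Version

MIN_TRANSFORMERS = Version("4.41.1")

def assert_supported_transformers() -> None:
    if Version(transformers.__version__) < MIN_TRANSFORMERS:
        raise ValueError("Please upgrade transformers to version 4.41.1 or higher.")

assert_supported_transformers()
```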