Commit 1994703 by czczup
1 Parent(s): 6d7aff9

Update README.md

Files changed (3):
  1. README.md +3 -7
  2. config.json +2 -58
  3. vocab.json +0 -0
README.md CHANGED
````diff
@@ -2,10 +2,8 @@
 license: mit
 pipeline_tag: image-text-to-text
 library_name: transformers
-base_model:
-- OpenGVLab/InternViT-300M-448px-V2_5
-- Qwen/Qwen2.5-3B-Instruct
-base_model_relation: merge
+base_model: OpenGVLab/InternVL2_5-4B
+base_model_relation: quantized
 language:
 - multilingual
 tags:
@@ -82,8 +80,6 @@ If `ImportError` occurs while executing this case, please install the required d
 
 When dealing with multiple images, you can put them all in one list. Keep in mind that multiple images will lead to a higher number of input tokens, and as a result, the size of the context window typically needs to be increased.
 
-question = 'Describe this video in detail.'
-
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig
 from lmdeploy.vl import load_image
@@ -186,7 +182,7 @@ print(response)
 
 ## License
 
-This project is released under the MIT License. This project uses the pre-trained Qwen2.5-72B-Instruct as a component, which is licensed under the Qwen License.
+This project is released under the MIT License. This project uses the pre-trained Qwen2.5-3B-Instruct as a component, which is licensed under the Apache License 2.0.
 
 ## Citation
 
````
config.json CHANGED
```diff
@@ -1,6 +1,5 @@
 {
-  "_commit_hash": "6d97fd21c37f3d744961189a8fbf2b0f228a85ec",
-  "_name_or_path": "/models/141/huggingface_hub/hub/models--OpenGVLab--InternVL2_5-4B/snapshots/6d97fd21c37f3d744961189a8fbf2b0f228a85ec/",
+  "_commit_hash": null,
   "architectures": [
     "InternVLChatModel"
   ],
@@ -20,6 +19,7 @@
   "architectures": [
     "Qwen2ForCausalLM"
   ],
+  "_attn_implementation": "eager",
   "attention_dropout": 0.0,
   "bad_words_ids": null,
   "begin_suppress_tokens": null,
@@ -111,93 +111,37 @@
   "select_layer": -1,
   "template": "internvl2_5",
   "torch_dtype": "float16",
-  "transformers_version": null,
   "use_backbone_lora": 0,
-  "use_cache": false,
   "use_llm_lora": 0,
   "use_thumbnail": true,
   "vision_config": {
     "_attn_implementation_autoset": true,
-    "_name_or_path": "",
-    "add_cross_attention": false,
     "architectures": [
       "InternVisionModel"
     ],
     "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
     "drop_path_rate": 0.0,
     "dropout": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
     "hidden_act": "gelu",
     "hidden_size": 1024,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
     "image_size": 448,
     "initializer_factor": 1.0,
     "initializer_range": 0.02,
     "intermediate_size": 4096,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
     "layer_norm_eps": 1e-06,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
     "model_type": "intern_vit_6b",
-    "no_repeat_ngram_size": 0,
     "norm_type": "layer_norm",
     "num_attention_heads": 16,
-    "num_beam_groups": 1,
-    "num_beams": 1,
     "num_channels": 3,
     "num_hidden_layers": 24,
-    "num_return_sequences": 1,
     "output_attentions": false,
     "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
     "patch_size": 14,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
     "qk_normalization": false,
     "qkv_bias": true,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
     "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
     "torch_dtype": "bfloat16",
-    "torchscript": false,
     "transformers_version": "4.47.0",
-    "typical_p": 1.0,
     "use_bfloat16": true,
     "use_flash_attn": false
   }
```
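The deleted `vision_config` keys are all generic generation defaults that `transformers` re-populates from `PretrainedConfig` at load time, so stripping them from the file does not change runtime behavior; likewise, the added `"_attn_implementation": "eager"` simply pins the Qwen2 LLM to the eager attention path instead of letting `transformers` auto-select one. A quick sketch of how to verify the defaults survive, assuming the repo id `OpenGVLab/InternVL2_5-4B-AWQ` (an assumption inferred from the `quantized` relation above):

```python
from transformers import AutoConfig

# InternVLChatConfig lives in the repo's remote code, hence trust_remote_code.
# Repo id is an assumption; substitute the repo this commit belongs to.
cfg = AutoConfig.from_pretrained('OpenGVLab/InternVL2_5-4B-AWQ',
                                 trust_remote_code=True)

vision = cfg.vision_config
# These keys were deleted from config.json, yet they reappear with the
# standard PretrainedConfig defaults once the config is instantiated.
print(vision.temperature)  # 1.0
print(vision.top_k)        # 50
print(vision.max_length)   # 20
```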
vocab.json CHANGED
The diff for this file is too large to render. See raw diff