Video-Text-to-Text
Transformers
Safetensors
English
llava
text-generation
multimodal
Eval Results
Inference Endpoints
ZhangYuanhan committed on
Commit
2c4f31e
1 Parent(s): 7c1b5a5

Upload config.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.json +201 -0
config.json ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/mnt/bn/vl-research/checkpoints/onevision/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-mid_to_final_next_2p4m_am9",
3
+ "add_time_instruction": true,
4
+ "add_faster_video": false,
5
+ "architectures": [
6
+ "LlavaQwenForCausalLM"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "bos_token_id": 151643,
10
+ "eos_token_id": 151645,
11
+ "force_sample": true,
12
+ "hidden_act": "silu",
13
+ "hidden_size": 3584,
14
+ "image_aspect_ratio": "anyres_max_9",
15
+ "image_crop_resolution": null,
16
+ "image_grid_pinpoints": [
17
+ [
18
+ 384,
19
+ 384
20
+ ],
21
+ [
22
+ 384,
23
+ 768
24
+ ],
25
+ [
26
+ 384,
27
+ 1152
28
+ ],
29
+ [
30
+ 384,
31
+ 1536
32
+ ],
33
+ [
34
+ 384,
35
+ 1920
36
+ ],
37
+ [
38
+ 384,
39
+ 2304
40
+ ],
41
+ [
42
+ 768,
43
+ 384
44
+ ],
45
+ [
46
+ 768,
47
+ 768
48
+ ],
49
+ [
50
+ 768,
51
+ 1152
52
+ ],
53
+ [
54
+ 768,
55
+ 1536
56
+ ],
57
+ [
58
+ 768,
59
+ 1920
60
+ ],
61
+ [
62
+ 768,
63
+ 2304
64
+ ],
65
+ [
66
+ 1152,
67
+ 384
68
+ ],
69
+ [
70
+ 1152,
71
+ 768
72
+ ],
73
+ [
74
+ 1152,
75
+ 1152
76
+ ],
77
+ [
78
+ 1152,
79
+ 1536
80
+ ],
81
+ [
82
+ 1152,
83
+ 1920
84
+ ],
85
+ [
86
+ 1152,
87
+ 2304
88
+ ],
89
+ [
90
+ 1536,
91
+ 384
92
+ ],
93
+ [
94
+ 1536,
95
+ 768
96
+ ],
97
+ [
98
+ 1536,
99
+ 1152
100
+ ],
101
+ [
102
+ 1536,
103
+ 1536
104
+ ],
105
+ [
106
+ 1536,
107
+ 1920
108
+ ],
109
+ [
110
+ 1536,
111
+ 2304
112
+ ],
113
+ [
114
+ 1920,
115
+ 384
116
+ ],
117
+ [
118
+ 1920,
119
+ 768
120
+ ],
121
+ [
122
+ 1920,
123
+ 1152
124
+ ],
125
+ [
126
+ 1920,
127
+ 1536
128
+ ],
129
+ [
130
+ 1920,
131
+ 1920
132
+ ],
133
+ [
134
+ 1920,
135
+ 2304
136
+ ],
137
+ [
138
+ 2304,
139
+ 384
140
+ ],
141
+ [
142
+ 2304,
143
+ 768
144
+ ],
145
+ [
146
+ 2304,
147
+ 1152
148
+ ],
149
+ [
150
+ 2304,
151
+ 1536
152
+ ],
153
+ [
154
+ 2304,
155
+ 1920
156
+ ],
157
+ [
158
+ 2304,
159
+ 2304
160
+ ]
161
+ ],
162
+ "image_split_resolution": null,
163
+ "initializer_range": 0.02,
164
+ "intermediate_size": 18944,
165
+ "max_position_embeddings": 32768,
166
+ "max_window_layers": 28,
167
+ "mm_hidden_size": 1152,
168
+ "mm_newline_position": "grid",
169
+ "mm_patch_merge_type": "spatial_unpad",
170
+ "mm_projector_lr": null,
171
+ "mm_projector_type": "mlp2x_gelu",
172
+ "mm_resampler_type": null,
173
+ "mm_spatial_pool_mode": "bilinear",
174
+ "mm_tunable_parts": "mm_vision_tower,mm_mlp_adapter,mm_language_model",
175
+ "mm_use_im_patch_token": false,
176
+ "mm_use_im_start_end": false,
177
+ "mm_vision_select_feature": "patch",
178
+ "mm_vision_select_layer": -2,
179
+ "mm_vision_tower": "google/siglip-so400m-patch14-384",
180
+ "mm_vision_tower_lr": 2e-06,
181
+ "model_type": "qwen2",
182
+ "num_attention_heads": 28,
183
+ "num_hidden_layers": 28,
184
+ "num_key_value_heads": 4,
185
+ "pos_skipping_range": 4096,
186
+ "rms_norm_eps": 1e-06,
187
+ "rope_scaling": null,
188
+ "rope_theta": 1000000.0,
189
+ "sliding_window": 131072,
190
+ "tie_word_embeddings": false,
191
+ "tokenizer_model_max_length": 32768,
192
+ "tokenizer_padding_side": "right",
193
+ "torch_dtype": "bfloat16",
194
+ "transformers_version": "4.40.0.dev0",
195
+ "use_cache": true,
196
+ "use_mm_proj": true,
197
+ "use_pos_skipping": false,
198
+ "use_sliding_window": false,
199
+ "vision_tower_pretrained": null,
200
+ "vocab_size": 152064
201
+ }