lucasjin committed
Commit 08b3cf0
1 Parent(s): 0ef4bae

Upload folder using huggingface_hub

added_tokens.json ADDED
@@ -0,0 +1,1048 @@
1
+ {
2
+ "</cap>": 152670,
3
+ "</dcap>": 152674,
4
+ "</grounding>": 152676,
5
+ "</ncap>": 152672,
6
+ "</ocr>": 151668,
7
+ "</od>": 151666,
8
+ "</poly>": 152687,
9
+ "</proposal>": 152685,
10
+ "</region_cap>": 152681,
11
+ "</region_to_desciption>": 152683,
12
+ "</seg>": 152678,
13
+ "</tool_call>": 151658,
14
+ "<and>": 152688,
15
+ "<cap>": 152669,
16
+ "<dcap>": 152673,
17
+ "<grounding>": 152675,
18
+ "<loc_0>": 151669,
19
+ "<loc_100>": 151769,
20
+ "<loc_101>": 151770,
21
+ "<loc_102>": 151771,
22
+ "<loc_103>": 151772,
23
+ "<loc_104>": 151773,
24
+ "<loc_105>": 151774,
25
+ "<loc_106>": 151775,
26
+ "<loc_107>": 151776,
27
+ "<loc_108>": 151777,
28
+ "<loc_109>": 151778,
29
+ "<loc_10>": 151679,
30
+ "<loc_110>": 151779,
31
+ "<loc_111>": 151780,
32
+ "<loc_112>": 151781,
33
+ "<loc_113>": 151782,
34
+ "<loc_114>": 151783,
35
+ "<loc_115>": 151784,
36
+ "<loc_116>": 151785,
37
+ "<loc_117>": 151786,
38
+ "<loc_118>": 151787,
39
+ "<loc_119>": 151788,
40
+ "<loc_11>": 151680,
41
+ "<loc_120>": 151789,
42
+ "<loc_121>": 151790,
43
+ "<loc_122>": 151791,
44
+ "<loc_123>": 151792,
45
+ "<loc_124>": 151793,
46
+ "<loc_125>": 151794,
47
+ "<loc_126>": 151795,
48
+ "<loc_127>": 151796,
49
+ "<loc_128>": 151797,
50
+ "<loc_129>": 151798,
51
+ "<loc_12>": 151681,
52
+ "<loc_130>": 151799,
53
+ "<loc_131>": 151800,
54
+ "<loc_132>": 151801,
55
+ "<loc_133>": 151802,
56
+ "<loc_134>": 151803,
57
+ "<loc_135>": 151804,
58
+ "<loc_136>": 151805,
59
+ "<loc_137>": 151806,
60
+ "<loc_138>": 151807,
61
+ "<loc_139>": 151808,
62
+ "<loc_13>": 151682,
63
+ "<loc_140>": 151809,
64
+ "<loc_141>": 151810,
65
+ "<loc_142>": 151811,
66
+ "<loc_143>": 151812,
67
+ "<loc_144>": 151813,
68
+ "<loc_145>": 151814,
69
+ "<loc_146>": 151815,
70
+ "<loc_147>": 151816,
71
+ "<loc_148>": 151817,
72
+ "<loc_149>": 151818,
73
+ "<loc_14>": 151683,
74
+ "<loc_150>": 151819,
75
+ "<loc_151>": 151820,
76
+ "<loc_152>": 151821,
77
+ "<loc_153>": 151822,
78
+ "<loc_154>": 151823,
79
+ "<loc_155>": 151824,
80
+ "<loc_156>": 151825,
81
+ "<loc_157>": 151826,
82
+ "<loc_158>": 151827,
83
+ "<loc_159>": 151828,
84
+ "<loc_15>": 151684,
85
+ "<loc_160>": 151829,
86
+ "<loc_161>": 151830,
87
+ "<loc_162>": 151831,
88
+ "<loc_163>": 151832,
89
+ "<loc_164>": 151833,
90
+ "<loc_165>": 151834,
91
+ "<loc_166>": 151835,
92
+ "<loc_167>": 151836,
93
+ "<loc_168>": 151837,
94
+ "<loc_169>": 151838,
95
+ "<loc_16>": 151685,
96
+ "<loc_170>": 151839,
97
+ "<loc_171>": 151840,
98
+ "<loc_172>": 151841,
99
+ "<loc_173>": 151842,
100
+ "<loc_174>": 151843,
101
+ "<loc_175>": 151844,
102
+ "<loc_176>": 151845,
103
+ "<loc_177>": 151846,
104
+ "<loc_178>": 151847,
105
+ "<loc_179>": 151848,
106
+ "<loc_17>": 151686,
107
+ "<loc_180>": 151849,
108
+ "<loc_181>": 151850,
109
+ "<loc_182>": 151851,
110
+ "<loc_183>": 151852,
111
+ "<loc_184>": 151853,
112
+ "<loc_185>": 151854,
113
+ "<loc_186>": 151855,
114
+ "<loc_187>": 151856,
115
+ "<loc_188>": 151857,
116
+ "<loc_189>": 151858,
117
+ "<loc_18>": 151687,
118
+ "<loc_190>": 151859,
119
+ "<loc_191>": 151860,
120
+ "<loc_192>": 151861,
121
+ "<loc_193>": 151862,
122
+ "<loc_194>": 151863,
123
+ "<loc_195>": 151864,
124
+ "<loc_196>": 151865,
125
+ "<loc_197>": 151866,
126
+ "<loc_198>": 151867,
127
+ "<loc_199>": 151868,
128
+ "<loc_19>": 151688,
129
+ "<loc_1>": 151670,
130
+ "<loc_200>": 151869,
131
+ "<loc_201>": 151870,
132
+ "<loc_202>": 151871,
133
+ "<loc_203>": 151872,
134
+ "<loc_204>": 151873,
135
+ "<loc_205>": 151874,
136
+ "<loc_206>": 151875,
137
+ "<loc_207>": 151876,
138
+ "<loc_208>": 151877,
139
+ "<loc_209>": 151878,
140
+ "<loc_20>": 151689,
141
+ "<loc_210>": 151879,
142
+ "<loc_211>": 151880,
143
+ "<loc_212>": 151881,
144
+ "<loc_213>": 151882,
145
+ "<loc_214>": 151883,
146
+ "<loc_215>": 151884,
147
+ "<loc_216>": 151885,
148
+ "<loc_217>": 151886,
149
+ "<loc_218>": 151887,
150
+ "<loc_219>": 151888,
151
+ "<loc_21>": 151690,
152
+ "<loc_220>": 151889,
153
+ "<loc_221>": 151890,
154
+ "<loc_222>": 151891,
155
+ "<loc_223>": 151892,
156
+ "<loc_224>": 151893,
157
+ "<loc_225>": 151894,
158
+ "<loc_226>": 151895,
159
+ "<loc_227>": 151896,
160
+ "<loc_228>": 151897,
161
+ "<loc_229>": 151898,
162
+ "<loc_22>": 151691,
163
+ "<loc_230>": 151899,
164
+ "<loc_231>": 151900,
165
+ "<loc_232>": 151901,
166
+ "<loc_233>": 151902,
167
+ "<loc_234>": 151903,
168
+ "<loc_235>": 151904,
169
+ "<loc_236>": 151905,
170
+ "<loc_237>": 151906,
171
+ "<loc_238>": 151907,
172
+ "<loc_239>": 151908,
173
+ "<loc_23>": 151692,
174
+ "<loc_240>": 151909,
175
+ "<loc_241>": 151910,
176
+ "<loc_242>": 151911,
177
+ "<loc_243>": 151912,
178
+ "<loc_244>": 151913,
179
+ "<loc_245>": 151914,
180
+ "<loc_246>": 151915,
181
+ "<loc_247>": 151916,
182
+ "<loc_248>": 151917,
183
+ "<loc_249>": 151918,
184
+ "<loc_24>": 151693,
185
+ "<loc_250>": 151919,
186
+ "<loc_251>": 151920,
187
+ "<loc_252>": 151921,
188
+ "<loc_253>": 151922,
189
+ "<loc_254>": 151923,
190
+ "<loc_255>": 151924,
191
+ "<loc_256>": 151925,
192
+ "<loc_257>": 151926,
193
+ "<loc_258>": 151927,
194
+ "<loc_259>": 151928,
195
+ "<loc_25>": 151694,
196
+ "<loc_260>": 151929,
197
+ "<loc_261>": 151930,
198
+ "<loc_262>": 151931,
199
+ "<loc_263>": 151932,
200
+ "<loc_264>": 151933,
201
+ "<loc_265>": 151934,
202
+ "<loc_266>": 151935,
203
+ "<loc_267>": 151936,
204
+ "<loc_268>": 151937,
205
+ "<loc_269>": 151938,
206
+ "<loc_26>": 151695,
207
+ "<loc_270>": 151939,
208
+ "<loc_271>": 151940,
209
+ "<loc_272>": 151941,
210
+ "<loc_273>": 151942,
211
+ "<loc_274>": 151943,
212
+ "<loc_275>": 151944,
213
+ "<loc_276>": 151945,
214
+ "<loc_277>": 151946,
215
+ "<loc_278>": 151947,
216
+ "<loc_279>": 151948,
217
+ "<loc_27>": 151696,
218
+ "<loc_280>": 151949,
219
+ "<loc_281>": 151950,
220
+ "<loc_282>": 151951,
221
+ "<loc_283>": 151952,
222
+ "<loc_284>": 151953,
223
+ "<loc_285>": 151954,
224
+ "<loc_286>": 151955,
225
+ "<loc_287>": 151956,
226
+ "<loc_288>": 151957,
227
+ "<loc_289>": 151958,
228
+ "<loc_28>": 151697,
229
+ "<loc_290>": 151959,
230
+ "<loc_291>": 151960,
231
+ "<loc_292>": 151961,
232
+ "<loc_293>": 151962,
233
+ "<loc_294>": 151963,
234
+ "<loc_295>": 151964,
235
+ "<loc_296>": 151965,
236
+ "<loc_297>": 151966,
237
+ "<loc_298>": 151967,
238
+ "<loc_299>": 151968,
239
+ "<loc_29>": 151698,
240
+ "<loc_2>": 151671,
241
+ "<loc_300>": 151969,
242
+ "<loc_301>": 151970,
243
+ "<loc_302>": 151971,
244
+ "<loc_303>": 151972,
245
+ "<loc_304>": 151973,
246
+ "<loc_305>": 151974,
247
+ "<loc_306>": 151975,
248
+ "<loc_307>": 151976,
249
+ "<loc_308>": 151977,
250
+ "<loc_309>": 151978,
251
+ "<loc_30>": 151699,
252
+ "<loc_310>": 151979,
253
+ "<loc_311>": 151980,
254
+ "<loc_312>": 151981,
255
+ "<loc_313>": 151982,
256
+ "<loc_314>": 151983,
257
+ "<loc_315>": 151984,
258
+ "<loc_316>": 151985,
259
+ "<loc_317>": 151986,
260
+ "<loc_318>": 151987,
261
+ "<loc_319>": 151988,
262
+ "<loc_31>": 151700,
263
+ "<loc_320>": 151989,
264
+ "<loc_321>": 151990,
265
+ "<loc_322>": 151991,
266
+ "<loc_323>": 151992,
267
+ "<loc_324>": 151993,
268
+ "<loc_325>": 151994,
269
+ "<loc_326>": 151995,
270
+ "<loc_327>": 151996,
271
+ "<loc_328>": 151997,
272
+ "<loc_329>": 151998,
273
+ "<loc_32>": 151701,
274
+ "<loc_330>": 151999,
275
+ "<loc_331>": 152000,
276
+ "<loc_332>": 152001,
277
+ "<loc_333>": 152002,
278
+ "<loc_334>": 152003,
279
+ "<loc_335>": 152004,
280
+ "<loc_336>": 152005,
281
+ "<loc_337>": 152006,
282
+ "<loc_338>": 152007,
283
+ "<loc_339>": 152008,
284
+ "<loc_33>": 151702,
285
+ "<loc_340>": 152009,
286
+ "<loc_341>": 152010,
287
+ "<loc_342>": 152011,
288
+ "<loc_343>": 152012,
289
+ "<loc_344>": 152013,
290
+ "<loc_345>": 152014,
291
+ "<loc_346>": 152015,
292
+ "<loc_347>": 152016,
293
+ "<loc_348>": 152017,
294
+ "<loc_349>": 152018,
295
+ "<loc_34>": 151703,
296
+ "<loc_350>": 152019,
297
+ "<loc_351>": 152020,
298
+ "<loc_352>": 152021,
299
+ "<loc_353>": 152022,
300
+ "<loc_354>": 152023,
301
+ "<loc_355>": 152024,
302
+ "<loc_356>": 152025,
303
+ "<loc_357>": 152026,
304
+ "<loc_358>": 152027,
305
+ "<loc_359>": 152028,
306
+ "<loc_35>": 151704,
307
+ "<loc_360>": 152029,
308
+ "<loc_361>": 152030,
309
+ "<loc_362>": 152031,
310
+ "<loc_363>": 152032,
311
+ "<loc_364>": 152033,
312
+ "<loc_365>": 152034,
313
+ "<loc_366>": 152035,
314
+ "<loc_367>": 152036,
315
+ "<loc_368>": 152037,
316
+ "<loc_369>": 152038,
317
+ "<loc_36>": 151705,
318
+ "<loc_370>": 152039,
319
+ "<loc_371>": 152040,
320
+ "<loc_372>": 152041,
321
+ "<loc_373>": 152042,
322
+ "<loc_374>": 152043,
323
+ "<loc_375>": 152044,
324
+ "<loc_376>": 152045,
325
+ "<loc_377>": 152046,
326
+ "<loc_378>": 152047,
327
+ "<loc_379>": 152048,
328
+ "<loc_37>": 151706,
329
+ "<loc_380>": 152049,
330
+ "<loc_381>": 152050,
331
+ "<loc_382>": 152051,
332
+ "<loc_383>": 152052,
333
+ "<loc_384>": 152053,
334
+ "<loc_385>": 152054,
335
+ "<loc_386>": 152055,
336
+ "<loc_387>": 152056,
337
+ "<loc_388>": 152057,
338
+ "<loc_389>": 152058,
339
+ "<loc_38>": 151707,
340
+ "<loc_390>": 152059,
341
+ "<loc_391>": 152060,
342
+ "<loc_392>": 152061,
343
+ "<loc_393>": 152062,
344
+ "<loc_394>": 152063,
345
+ "<loc_395>": 152064,
346
+ "<loc_396>": 152065,
347
+ "<loc_397>": 152066,
348
+ "<loc_398>": 152067,
349
+ "<loc_399>": 152068,
350
+ "<loc_39>": 151708,
351
+ "<loc_3>": 151672,
352
+ "<loc_400>": 152069,
353
+ "<loc_401>": 152070,
354
+ "<loc_402>": 152071,
355
+ "<loc_403>": 152072,
356
+ "<loc_404>": 152073,
357
+ "<loc_405>": 152074,
358
+ "<loc_406>": 152075,
359
+ "<loc_407>": 152076,
360
+ "<loc_408>": 152077,
361
+ "<loc_409>": 152078,
362
+ "<loc_40>": 151709,
363
+ "<loc_410>": 152079,
364
+ "<loc_411>": 152080,
365
+ "<loc_412>": 152081,
366
+ "<loc_413>": 152082,
367
+ "<loc_414>": 152083,
368
+ "<loc_415>": 152084,
369
+ "<loc_416>": 152085,
370
+ "<loc_417>": 152086,
371
+ "<loc_418>": 152087,
372
+ "<loc_419>": 152088,
373
+ "<loc_41>": 151710,
374
+ "<loc_420>": 152089,
375
+ "<loc_421>": 152090,
376
+ "<loc_422>": 152091,
377
+ "<loc_423>": 152092,
378
+ "<loc_424>": 152093,
379
+ "<loc_425>": 152094,
380
+ "<loc_426>": 152095,
381
+ "<loc_427>": 152096,
382
+ "<loc_428>": 152097,
383
+ "<loc_429>": 152098,
384
+ "<loc_42>": 151711,
385
+ "<loc_430>": 152099,
386
+ "<loc_431>": 152100,
387
+ "<loc_432>": 152101,
388
+ "<loc_433>": 152102,
389
+ "<loc_434>": 152103,
390
+ "<loc_435>": 152104,
391
+ "<loc_436>": 152105,
392
+ "<loc_437>": 152106,
393
+ "<loc_438>": 152107,
394
+ "<loc_439>": 152108,
395
+ "<loc_43>": 151712,
396
+ "<loc_440>": 152109,
397
+ "<loc_441>": 152110,
398
+ "<loc_442>": 152111,
399
+ "<loc_443>": 152112,
400
+ "<loc_444>": 152113,
401
+ "<loc_445>": 152114,
402
+ "<loc_446>": 152115,
403
+ "<loc_447>": 152116,
404
+ "<loc_448>": 152117,
405
+ "<loc_449>": 152118,
406
+ "<loc_44>": 151713,
407
+ "<loc_450>": 152119,
408
+ "<loc_451>": 152120,
409
+ "<loc_452>": 152121,
410
+ "<loc_453>": 152122,
411
+ "<loc_454>": 152123,
412
+ "<loc_455>": 152124,
413
+ "<loc_456>": 152125,
414
+ "<loc_457>": 152126,
415
+ "<loc_458>": 152127,
416
+ "<loc_459>": 152128,
417
+ "<loc_45>": 151714,
418
+ "<loc_460>": 152129,
419
+ "<loc_461>": 152130,
420
+ "<loc_462>": 152131,
421
+ "<loc_463>": 152132,
422
+ "<loc_464>": 152133,
423
+ "<loc_465>": 152134,
424
+ "<loc_466>": 152135,
425
+ "<loc_467>": 152136,
426
+ "<loc_468>": 152137,
427
+ "<loc_469>": 152138,
428
+ "<loc_46>": 151715,
429
+ "<loc_470>": 152139,
430
+ "<loc_471>": 152140,
431
+ "<loc_472>": 152141,
432
+ "<loc_473>": 152142,
433
+ "<loc_474>": 152143,
434
+ "<loc_475>": 152144,
435
+ "<loc_476>": 152145,
436
+ "<loc_477>": 152146,
437
+ "<loc_478>": 152147,
438
+ "<loc_479>": 152148,
439
+ "<loc_47>": 151716,
440
+ "<loc_480>": 152149,
441
+ "<loc_481>": 152150,
442
+ "<loc_482>": 152151,
443
+ "<loc_483>": 152152,
444
+ "<loc_484>": 152153,
445
+ "<loc_485>": 152154,
446
+ "<loc_486>": 152155,
447
+ "<loc_487>": 152156,
448
+ "<loc_488>": 152157,
449
+ "<loc_489>": 152158,
450
+ "<loc_48>": 151717,
451
+ "<loc_490>": 152159,
452
+ "<loc_491>": 152160,
453
+ "<loc_492>": 152161,
454
+ "<loc_493>": 152162,
455
+ "<loc_494>": 152163,
456
+ "<loc_495>": 152164,
457
+ "<loc_496>": 152165,
458
+ "<loc_497>": 152166,
459
+ "<loc_498>": 152167,
460
+ "<loc_499>": 152168,
461
+ "<loc_49>": 151718,
462
+ "<loc_4>": 151673,
463
+ "<loc_500>": 152169,
464
+ "<loc_501>": 152170,
465
+ "<loc_502>": 152171,
466
+ "<loc_503>": 152172,
467
+ "<loc_504>": 152173,
468
+ "<loc_505>": 152174,
469
+ "<loc_506>": 152175,
470
+ "<loc_507>": 152176,
471
+ "<loc_508>": 152177,
472
+ "<loc_509>": 152178,
473
+ "<loc_50>": 151719,
474
+ "<loc_510>": 152179,
475
+ "<loc_511>": 152180,
476
+ "<loc_512>": 152181,
477
+ "<loc_513>": 152182,
478
+ "<loc_514>": 152183,
479
+ "<loc_515>": 152184,
480
+ "<loc_516>": 152185,
481
+ "<loc_517>": 152186,
482
+ "<loc_518>": 152187,
483
+ "<loc_519>": 152188,
484
+ "<loc_51>": 151720,
485
+ "<loc_520>": 152189,
486
+ "<loc_521>": 152190,
487
+ "<loc_522>": 152191,
488
+ "<loc_523>": 152192,
489
+ "<loc_524>": 152193,
490
+ "<loc_525>": 152194,
491
+ "<loc_526>": 152195,
492
+ "<loc_527>": 152196,
493
+ "<loc_528>": 152197,
494
+ "<loc_529>": 152198,
495
+ "<loc_52>": 151721,
496
+ "<loc_530>": 152199,
497
+ "<loc_531>": 152200,
498
+ "<loc_532>": 152201,
499
+ "<loc_533>": 152202,
500
+ "<loc_534>": 152203,
501
+ "<loc_535>": 152204,
502
+ "<loc_536>": 152205,
503
+ "<loc_537>": 152206,
504
+ "<loc_538>": 152207,
505
+ "<loc_539>": 152208,
506
+ "<loc_53>": 151722,
507
+ "<loc_540>": 152209,
508
+ "<loc_541>": 152210,
509
+ "<loc_542>": 152211,
510
+ "<loc_543>": 152212,
511
+ "<loc_544>": 152213,
512
+ "<loc_545>": 152214,
513
+ "<loc_546>": 152215,
514
+ "<loc_547>": 152216,
515
+ "<loc_548>": 152217,
516
+ "<loc_549>": 152218,
517
+ "<loc_54>": 151723,
518
+ "<loc_550>": 152219,
519
+ "<loc_551>": 152220,
520
+ "<loc_552>": 152221,
521
+ "<loc_553>": 152222,
522
+ "<loc_554>": 152223,
523
+ "<loc_555>": 152224,
524
+ "<loc_556>": 152225,
525
+ "<loc_557>": 152226,
526
+ "<loc_558>": 152227,
527
+ "<loc_559>": 152228,
528
+ "<loc_55>": 151724,
529
+ "<loc_560>": 152229,
530
+ "<loc_561>": 152230,
531
+ "<loc_562>": 152231,
532
+ "<loc_563>": 152232,
533
+ "<loc_564>": 152233,
534
+ "<loc_565>": 152234,
535
+ "<loc_566>": 152235,
536
+ "<loc_567>": 152236,
537
+ "<loc_568>": 152237,
538
+ "<loc_569>": 152238,
539
+ "<loc_56>": 151725,
540
+ "<loc_570>": 152239,
541
+ "<loc_571>": 152240,
542
+ "<loc_572>": 152241,
543
+ "<loc_573>": 152242,
544
+ "<loc_574>": 152243,
545
+ "<loc_575>": 152244,
546
+ "<loc_576>": 152245,
547
+ "<loc_577>": 152246,
548
+ "<loc_578>": 152247,
549
+ "<loc_579>": 152248,
550
+ "<loc_57>": 151726,
551
+ "<loc_580>": 152249,
552
+ "<loc_581>": 152250,
553
+ "<loc_582>": 152251,
554
+ "<loc_583>": 152252,
555
+ "<loc_584>": 152253,
556
+ "<loc_585>": 152254,
557
+ "<loc_586>": 152255,
558
+ "<loc_587>": 152256,
559
+ "<loc_588>": 152257,
560
+ "<loc_589>": 152258,
561
+ "<loc_58>": 151727,
562
+ "<loc_590>": 152259,
563
+ "<loc_591>": 152260,
564
+ "<loc_592>": 152261,
565
+ "<loc_593>": 152262,
566
+ "<loc_594>": 152263,
567
+ "<loc_595>": 152264,
568
+ "<loc_596>": 152265,
569
+ "<loc_597>": 152266,
570
+ "<loc_598>": 152267,
571
+ "<loc_599>": 152268,
572
+ "<loc_59>": 151728,
573
+ "<loc_5>": 151674,
574
+ "<loc_600>": 152269,
575
+ "<loc_601>": 152270,
576
+ "<loc_602>": 152271,
577
+ "<loc_603>": 152272,
578
+ "<loc_604>": 152273,
579
+ "<loc_605>": 152274,
580
+ "<loc_606>": 152275,
581
+ "<loc_607>": 152276,
582
+ "<loc_608>": 152277,
583
+ "<loc_609>": 152278,
584
+ "<loc_60>": 151729,
585
+ "<loc_610>": 152279,
586
+ "<loc_611>": 152280,
587
+ "<loc_612>": 152281,
588
+ "<loc_613>": 152282,
589
+ "<loc_614>": 152283,
590
+ "<loc_615>": 152284,
591
+ "<loc_616>": 152285,
592
+ "<loc_617>": 152286,
593
+ "<loc_618>": 152287,
594
+ "<loc_619>": 152288,
595
+ "<loc_61>": 151730,
596
+ "<loc_620>": 152289,
597
+ "<loc_621>": 152290,
598
+ "<loc_622>": 152291,
599
+ "<loc_623>": 152292,
600
+ "<loc_624>": 152293,
601
+ "<loc_625>": 152294,
602
+ "<loc_626>": 152295,
603
+ "<loc_627>": 152296,
604
+ "<loc_628>": 152297,
605
+ "<loc_629>": 152298,
606
+ "<loc_62>": 151731,
607
+ "<loc_630>": 152299,
608
+ "<loc_631>": 152300,
609
+ "<loc_632>": 152301,
610
+ "<loc_633>": 152302,
611
+ "<loc_634>": 152303,
612
+ "<loc_635>": 152304,
613
+ "<loc_636>": 152305,
614
+ "<loc_637>": 152306,
615
+ "<loc_638>": 152307,
616
+ "<loc_639>": 152308,
617
+ "<loc_63>": 151732,
618
+ "<loc_640>": 152309,
619
+ "<loc_641>": 152310,
620
+ "<loc_642>": 152311,
621
+ "<loc_643>": 152312,
622
+ "<loc_644>": 152313,
623
+ "<loc_645>": 152314,
624
+ "<loc_646>": 152315,
625
+ "<loc_647>": 152316,
626
+ "<loc_648>": 152317,
627
+ "<loc_649>": 152318,
628
+ "<loc_64>": 151733,
629
+ "<loc_650>": 152319,
630
+ "<loc_651>": 152320,
631
+ "<loc_652>": 152321,
632
+ "<loc_653>": 152322,
633
+ "<loc_654>": 152323,
634
+ "<loc_655>": 152324,
635
+ "<loc_656>": 152325,
636
+ "<loc_657>": 152326,
637
+ "<loc_658>": 152327,
638
+ "<loc_659>": 152328,
639
+ "<loc_65>": 151734,
640
+ "<loc_660>": 152329,
641
+ "<loc_661>": 152330,
642
+ "<loc_662>": 152331,
643
+ "<loc_663>": 152332,
644
+ "<loc_664>": 152333,
645
+ "<loc_665>": 152334,
646
+ "<loc_666>": 152335,
647
+ "<loc_667>": 152336,
648
+ "<loc_668>": 152337,
649
+ "<loc_669>": 152338,
650
+ "<loc_66>": 151735,
651
+ "<loc_670>": 152339,
652
+ "<loc_671>": 152340,
653
+ "<loc_672>": 152341,
654
+ "<loc_673>": 152342,
655
+ "<loc_674>": 152343,
656
+ "<loc_675>": 152344,
657
+ "<loc_676>": 152345,
658
+ "<loc_677>": 152346,
659
+ "<loc_678>": 152347,
660
+ "<loc_679>": 152348,
661
+ "<loc_67>": 151736,
662
+ "<loc_680>": 152349,
663
+ "<loc_681>": 152350,
664
+ "<loc_682>": 152351,
665
+ "<loc_683>": 152352,
666
+ "<loc_684>": 152353,
667
+ "<loc_685>": 152354,
668
+ "<loc_686>": 152355,
669
+ "<loc_687>": 152356,
670
+ "<loc_688>": 152357,
671
+ "<loc_689>": 152358,
672
+ "<loc_68>": 151737,
673
+ "<loc_690>": 152359,
674
+ "<loc_691>": 152360,
675
+ "<loc_692>": 152361,
676
+ "<loc_693>": 152362,
677
+ "<loc_694>": 152363,
678
+ "<loc_695>": 152364,
679
+ "<loc_696>": 152365,
680
+ "<loc_697>": 152366,
681
+ "<loc_698>": 152367,
682
+ "<loc_699>": 152368,
683
+ "<loc_69>": 151738,
684
+ "<loc_6>": 151675,
685
+ "<loc_700>": 152369,
686
+ "<loc_701>": 152370,
687
+ "<loc_702>": 152371,
688
+ "<loc_703>": 152372,
689
+ "<loc_704>": 152373,
690
+ "<loc_705>": 152374,
691
+ "<loc_706>": 152375,
692
+ "<loc_707>": 152376,
693
+ "<loc_708>": 152377,
694
+ "<loc_709>": 152378,
695
+ "<loc_70>": 151739,
696
+ "<loc_710>": 152379,
697
+ "<loc_711>": 152380,
698
+ "<loc_712>": 152381,
699
+ "<loc_713>": 152382,
700
+ "<loc_714>": 152383,
701
+ "<loc_715>": 152384,
702
+ "<loc_716>": 152385,
703
+ "<loc_717>": 152386,
704
+ "<loc_718>": 152387,
705
+ "<loc_719>": 152388,
706
+ "<loc_71>": 151740,
707
+ "<loc_720>": 152389,
708
+ "<loc_721>": 152390,
709
+ "<loc_722>": 152391,
710
+ "<loc_723>": 152392,
711
+ "<loc_724>": 152393,
712
+ "<loc_725>": 152394,
713
+ "<loc_726>": 152395,
714
+ "<loc_727>": 152396,
715
+ "<loc_728>": 152397,
716
+ "<loc_729>": 152398,
717
+ "<loc_72>": 151741,
718
+ "<loc_730>": 152399,
719
+ "<loc_731>": 152400,
720
+ "<loc_732>": 152401,
721
+ "<loc_733>": 152402,
722
+ "<loc_734>": 152403,
723
+ "<loc_735>": 152404,
724
+ "<loc_736>": 152405,
725
+ "<loc_737>": 152406,
726
+ "<loc_738>": 152407,
727
+ "<loc_739>": 152408,
728
+ "<loc_73>": 151742,
729
+ "<loc_740>": 152409,
730
+ "<loc_741>": 152410,
731
+ "<loc_742>": 152411,
732
+ "<loc_743>": 152412,
733
+ "<loc_744>": 152413,
734
+ "<loc_745>": 152414,
735
+ "<loc_746>": 152415,
736
+ "<loc_747>": 152416,
737
+ "<loc_748>": 152417,
738
+ "<loc_749>": 152418,
739
+ "<loc_74>": 151743,
740
+ "<loc_750>": 152419,
741
+ "<loc_751>": 152420,
742
+ "<loc_752>": 152421,
743
+ "<loc_753>": 152422,
744
+ "<loc_754>": 152423,
745
+ "<loc_755>": 152424,
746
+ "<loc_756>": 152425,
747
+ "<loc_757>": 152426,
748
+ "<loc_758>": 152427,
749
+ "<loc_759>": 152428,
750
+ "<loc_75>": 151744,
751
+ "<loc_760>": 152429,
752
+ "<loc_761>": 152430,
753
+ "<loc_762>": 152431,
754
+ "<loc_763>": 152432,
755
+ "<loc_764>": 152433,
756
+ "<loc_765>": 152434,
757
+ "<loc_766>": 152435,
758
+ "<loc_767>": 152436,
759
+ "<loc_768>": 152437,
760
+ "<loc_769>": 152438,
761
+ "<loc_76>": 151745,
762
+ "<loc_770>": 152439,
763
+ "<loc_771>": 152440,
764
+ "<loc_772>": 152441,
765
+ "<loc_773>": 152442,
766
+ "<loc_774>": 152443,
767
+ "<loc_775>": 152444,
768
+ "<loc_776>": 152445,
769
+ "<loc_777>": 152446,
770
+ "<loc_778>": 152447,
771
+ "<loc_779>": 152448,
772
+ "<loc_77>": 151746,
773
+ "<loc_780>": 152449,
774
+ "<loc_781>": 152450,
775
+ "<loc_782>": 152451,
776
+ "<loc_783>": 152452,
777
+ "<loc_784>": 152453,
778
+ "<loc_785>": 152454,
779
+ "<loc_786>": 152455,
780
+ "<loc_787>": 152456,
781
+ "<loc_788>": 152457,
782
+ "<loc_789>": 152458,
783
+ "<loc_78>": 151747,
784
+ "<loc_790>": 152459,
785
+ "<loc_791>": 152460,
786
+ "<loc_792>": 152461,
787
+ "<loc_793>": 152462,
788
+ "<loc_794>": 152463,
789
+ "<loc_795>": 152464,
790
+ "<loc_796>": 152465,
791
+ "<loc_797>": 152466,
792
+ "<loc_798>": 152467,
793
+ "<loc_799>": 152468,
794
+ "<loc_79>": 151748,
795
+ "<loc_7>": 151676,
796
+ "<loc_800>": 152469,
797
+ "<loc_801>": 152470,
798
+ "<loc_802>": 152471,
799
+ "<loc_803>": 152472,
800
+ "<loc_804>": 152473,
801
+ "<loc_805>": 152474,
802
+ "<loc_806>": 152475,
803
+ "<loc_807>": 152476,
804
+ "<loc_808>": 152477,
805
+ "<loc_809>": 152478,
806
+ "<loc_80>": 151749,
807
+ "<loc_810>": 152479,
808
+ "<loc_811>": 152480,
809
+ "<loc_812>": 152481,
810
+ "<loc_813>": 152482,
811
+ "<loc_814>": 152483,
812
+ "<loc_815>": 152484,
813
+ "<loc_816>": 152485,
814
+ "<loc_817>": 152486,
815
+ "<loc_818>": 152487,
816
+ "<loc_819>": 152488,
817
+ "<loc_81>": 151750,
818
+ "<loc_820>": 152489,
819
+ "<loc_821>": 152490,
820
+ "<loc_822>": 152491,
821
+ "<loc_823>": 152492,
822
+ "<loc_824>": 152493,
823
+ "<loc_825>": 152494,
824
+ "<loc_826>": 152495,
825
+ "<loc_827>": 152496,
826
+ "<loc_828>": 152497,
827
+ "<loc_829>": 152498,
828
+ "<loc_82>": 151751,
829
+ "<loc_830>": 152499,
830
+ "<loc_831>": 152500,
831
+ "<loc_832>": 152501,
832
+ "<loc_833>": 152502,
833
+ "<loc_834>": 152503,
834
+ "<loc_835>": 152504,
835
+ "<loc_836>": 152505,
836
+ "<loc_837>": 152506,
837
+ "<loc_838>": 152507,
838
+ "<loc_839>": 152508,
839
+ "<loc_83>": 151752,
840
+ "<loc_840>": 152509,
841
+ "<loc_841>": 152510,
842
+ "<loc_842>": 152511,
843
+ "<loc_843>": 152512,
844
+ "<loc_844>": 152513,
845
+ "<loc_845>": 152514,
846
+ "<loc_846>": 152515,
847
+ "<loc_847>": 152516,
848
+ "<loc_848>": 152517,
849
+ "<loc_849>": 152518,
850
+ "<loc_84>": 151753,
851
+ "<loc_850>": 152519,
852
+ "<loc_851>": 152520,
853
+ "<loc_852>": 152521,
854
+ "<loc_853>": 152522,
855
+ "<loc_854>": 152523,
856
+ "<loc_855>": 152524,
857
+ "<loc_856>": 152525,
858
+ "<loc_857>": 152526,
859
+ "<loc_858>": 152527,
860
+ "<loc_859>": 152528,
861
+ "<loc_85>": 151754,
862
+ "<loc_860>": 152529,
863
+ "<loc_861>": 152530,
864
+ "<loc_862>": 152531,
865
+ "<loc_863>": 152532,
866
+ "<loc_864>": 152533,
867
+ "<loc_865>": 152534,
868
+ "<loc_866>": 152535,
869
+ "<loc_867>": 152536,
870
+ "<loc_868>": 152537,
871
+ "<loc_869>": 152538,
872
+ "<loc_86>": 151755,
873
+ "<loc_870>": 152539,
874
+ "<loc_871>": 152540,
875
+ "<loc_872>": 152541,
876
+ "<loc_873>": 152542,
877
+ "<loc_874>": 152543,
878
+ "<loc_875>": 152544,
879
+ "<loc_876>": 152545,
880
+ "<loc_877>": 152546,
881
+ "<loc_878>": 152547,
882
+ "<loc_879>": 152548,
883
+ "<loc_87>": 151756,
884
+ "<loc_880>": 152549,
885
+ "<loc_881>": 152550,
886
+ "<loc_882>": 152551,
887
+ "<loc_883>": 152552,
888
+ "<loc_884>": 152553,
889
+ "<loc_885>": 152554,
890
+ "<loc_886>": 152555,
891
+ "<loc_887>": 152556,
892
+ "<loc_888>": 152557,
893
+ "<loc_889>": 152558,
894
+ "<loc_88>": 151757,
895
+ "<loc_890>": 152559,
896
+ "<loc_891>": 152560,
897
+ "<loc_892>": 152561,
898
+ "<loc_893>": 152562,
899
+ "<loc_894>": 152563,
900
+ "<loc_895>": 152564,
901
+ "<loc_896>": 152565,
902
+ "<loc_897>": 152566,
903
+ "<loc_898>": 152567,
904
+ "<loc_899>": 152568,
905
+ "<loc_89>": 151758,
906
+ "<loc_8>": 151677,
907
+ "<loc_900>": 152569,
908
+ "<loc_901>": 152570,
909
+ "<loc_902>": 152571,
910
+ "<loc_903>": 152572,
911
+ "<loc_904>": 152573,
912
+ "<loc_905>": 152574,
913
+ "<loc_906>": 152575,
914
+ "<loc_907>": 152576,
915
+ "<loc_908>": 152577,
916
+ "<loc_909>": 152578,
917
+ "<loc_90>": 151759,
918
+ "<loc_910>": 152579,
919
+ "<loc_911>": 152580,
920
+ "<loc_912>": 152581,
921
+ "<loc_913>": 152582,
922
+ "<loc_914>": 152583,
923
+ "<loc_915>": 152584,
924
+ "<loc_916>": 152585,
925
+ "<loc_917>": 152586,
926
+ "<loc_918>": 152587,
927
+ "<loc_919>": 152588,
928
+ "<loc_91>": 151760,
929
+ "<loc_920>": 152589,
930
+ "<loc_921>": 152590,
931
+ "<loc_922>": 152591,
932
+ "<loc_923>": 152592,
933
+ "<loc_924>": 152593,
934
+ "<loc_925>": 152594,
935
+ "<loc_926>": 152595,
936
+ "<loc_927>": 152596,
937
+ "<loc_928>": 152597,
938
+ "<loc_929>": 152598,
939
+ "<loc_92>": 151761,
940
+ "<loc_930>": 152599,
941
+ "<loc_931>": 152600,
942
+ "<loc_932>": 152601,
943
+ "<loc_933>": 152602,
944
+ "<loc_934>": 152603,
945
+ "<loc_935>": 152604,
946
+ "<loc_936>": 152605,
947
+ "<loc_937>": 152606,
948
+ "<loc_938>": 152607,
949
+ "<loc_939>": 152608,
950
+ "<loc_93>": 151762,
951
+ "<loc_940>": 152609,
952
+ "<loc_941>": 152610,
953
+ "<loc_942>": 152611,
954
+ "<loc_943>": 152612,
955
+ "<loc_944>": 152613,
956
+ "<loc_945>": 152614,
957
+ "<loc_946>": 152615,
958
+ "<loc_947>": 152616,
959
+ "<loc_948>": 152617,
960
+ "<loc_949>": 152618,
961
+ "<loc_94>": 151763,
962
+ "<loc_950>": 152619,
963
+ "<loc_951>": 152620,
964
+ "<loc_952>": 152621,
965
+ "<loc_953>": 152622,
966
+ "<loc_954>": 152623,
967
+ "<loc_955>": 152624,
968
+ "<loc_956>": 152625,
969
+ "<loc_957>": 152626,
970
+ "<loc_958>": 152627,
971
+ "<loc_959>": 152628,
972
+ "<loc_95>": 151764,
973
+ "<loc_960>": 152629,
974
+ "<loc_961>": 152630,
975
+ "<loc_962>": 152631,
976
+ "<loc_963>": 152632,
977
+ "<loc_964>": 152633,
978
+ "<loc_965>": 152634,
979
+ "<loc_966>": 152635,
980
+ "<loc_967>": 152636,
981
+ "<loc_968>": 152637,
982
+ "<loc_969>": 152638,
983
+ "<loc_96>": 151765,
984
+ "<loc_970>": 152639,
985
+ "<loc_971>": 152640,
986
+ "<loc_972>": 152641,
987
+ "<loc_973>": 152642,
988
+ "<loc_974>": 152643,
989
+ "<loc_975>": 152644,
990
+ "<loc_976>": 152645,
991
+ "<loc_977>": 152646,
992
+ "<loc_978>": 152647,
993
+ "<loc_979>": 152648,
994
+ "<loc_97>": 151766,
995
+ "<loc_980>": 152649,
996
+ "<loc_981>": 152650,
997
+ "<loc_982>": 152651,
998
+ "<loc_983>": 152652,
999
+ "<loc_984>": 152653,
1000
+ "<loc_985>": 152654,
1001
+ "<loc_986>": 152655,
1002
+ "<loc_987>": 152656,
1003
+ "<loc_988>": 152657,
1004
+ "<loc_989>": 152658,
1005
+ "<loc_98>": 151767,
1006
+ "<loc_990>": 152659,
1007
+ "<loc_991>": 152660,
1008
+ "<loc_992>": 152661,
1009
+ "<loc_993>": 152662,
1010
+ "<loc_994>": 152663,
1011
+ "<loc_995>": 152664,
1012
+ "<loc_996>": 152665,
1013
+ "<loc_997>": 152666,
1014
+ "<loc_998>": 152667,
1015
+ "<loc_999>": 152668,
1016
+ "<loc_99>": 151768,
1017
+ "<loc_9>": 151678,
1018
+ "<ncap>": 152671,
1019
+ "<ocr>": 151667,
1020
+ "<od>": 151665,
1021
+ "<poly>": 152686,
1022
+ "<proposal>": 152684,
1023
+ "<region_cap>": 152680,
1024
+ "<region_to_desciption>": 152682,
1025
+ "<seg>": 152677,
1026
+ "<sep>": 152679,
1027
+ "<tool_call>": 151657,
1028
+ "<|box_end|>": 151649,
1029
+ "<|box_start|>": 151648,
1030
+ "<|endoftext|>": 151643,
1031
+ "<|file_sep|>": 151664,
1032
+ "<|fim_middle|>": 151660,
1033
+ "<|fim_pad|>": 151662,
1034
+ "<|fim_prefix|>": 151659,
1035
+ "<|fim_suffix|>": 151661,
1036
+ "<|im_end|>": 151645,
1037
+ "<|im_start|>": 151644,
1038
+ "<|image_pad|>": 151655,
1039
+ "<|object_ref_end|>": 151647,
1040
+ "<|object_ref_start|>": 151646,
1041
+ "<|quad_end|>": 151651,
1042
+ "<|quad_start|>": 151650,
1043
+ "<|repo_name|>": 151663,
1044
+ "<|video_pad|>": 151656,
1045
+ "<|vision_end|>": 151653,
1046
+ "<|vision_pad|>": 151654,
1047
+ "<|vision_start|>": 151652
1048
+ }
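
The mapping above adds the task tags (<od>, <cap>, <grounding>, <seg>, ...) and 1,000 quantized coordinate tokens <loc_0> through <loc_999> on top of the Qwen2-style special tokens already present; the location ids are contiguous, i.e. <loc_N> = 151669 + N. A minimal sketch of how these entries are typically picked up, assuming the checkpoint folder (the path below is a placeholder) is loaded with a standard transformers tokenizer:

from transformers import AutoTokenizer

# "path/to/checkpoint" is a placeholder for the local folder containing these files.
tok = AutoTokenizer.from_pretrained("path/to/checkpoint", trust_remote_code=True)

# added_tokens.json is read automatically; the resolved ids match the listing above,
# e.g. <od> -> 151665 and <loc_N> -> 151669 + N for N in 0..999.
print(tok.convert_tokens_to_ids(["<od>", "<loc_0>", "<loc_999>", "</od>"]))
# expected: [151665, 151669, 152668, 151666]
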
config.json ADDED
@@ -0,0 +1,112 @@
+ {
+   "_name_or_path": "checkpoints/mono-ve-3b-v0/checkpoint-7400/",
+   "architectures": [
+     "MonoForConditionalGeneration"
+   ],
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_aimv2.MonoConfig",
+     "AutoModel": "modeling_mono.MonoForConditionalGeneration",
+     "AutoModelForCausalLM": "modeling_mono.MonoForConditionalGeneration"
+   },
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 1536,
+   "ignore_index": -100,
+   "initializer_range": 0.02,
+   "intermediate_size": 8960,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 21,
+   "model_type": "mono",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 2,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.46.2",
+   "use_bias": false,
+   "use_cache": false,
+   "use_sliding_window": false,
+   "vision_config": {
+     "_attn_implementation_autoset": true,
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_size": 3072,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 448,
+     "intermediate_size": 8192,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "aimv2",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 24,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 24,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 14,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dropout": 0.0,
+     "pruned_heads": {},
+     "qkv_bias": false,
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "rms_norm_eps": 1e-05,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "use_bias": false
+   },
+   "vocab_size": 151936
+ }
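
config.json wires the checkpoint to the custom classes shipped in this repo via auto_map: a 28-layer Qwen2-style language model (model_type "mono") with an aimv2 vision tower (448-pixel images, 14-pixel patches), stored in bfloat16. A minimal loading sketch, assuming the modeling_mono.py referenced in auto_map sits in the same folder; the path below is a placeholder:

import torch
from transformers import AutoConfig, AutoModelForCausalLM

path = "path/to/checkpoint"  # placeholder for the local folder containing these files

cfg = AutoConfig.from_pretrained(path, trust_remote_code=True)
print(cfg.model_type, cfg.vision_config.model_type)  # "mono", "aimv2"

model = AutoModelForCausalLM.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" above
    trust_remote_code=True,      # required: the classes live in this repo, not in transformers
)
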
configuration_aimv2.py ADDED
@@ -0,0 +1,62 @@
+ from typing import Any
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers import Qwen2Config
+
+ __all__ = ["AIMv2Config", "MonoConfig"]
+
+
+ class AIMv2Config(PretrainedConfig):
+
+     model_type: str = "aimv2"
+
+     def __init__(
+         self,
+         hidden_size: int = 1024,
+         intermediate_size: int = 2816,
+         num_hidden_layers: int = 24,
+         num_attention_heads: int = 8,
+         num_channels: int = 3,
+         image_size: int = 224,
+         patch_size: int = 14,
+         rms_norm_eps: float = 1e-5,
+         attention_dropout: float = 0.0,
+         projection_dropout: float = 0.0,
+         qkv_bias: bool = False,
+         use_bias: bool = False,
+         text_config=None,
+         **kwargs: Any,
+     ):
+         super().__init__(**kwargs)
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_channels = num_channels
+         self.patch_size = patch_size
+         self.image_size = image_size
+         self.attention_dropout = attention_dropout
+         self.rms_norm_eps = rms_norm_eps
+
+         self.projection_dropout = projection_dropout
+         self.qkv_bias = qkv_bias
+         self.use_bias = use_bias
+
+
+ class MonoConfig(Qwen2Config):
+
+     model_type = "mono"
+     is_composition = False
+
+     def __init__(
+         self,
+         vision_config=None,
+         ignore_index=-100,
+         **kwargs,
+     ):
+         self.ignore_index = ignore_index
+         if vision_config is not None:
+             vision_config = AIMv2Config(**vision_config)
+         self.vision_config = vision_config
+
+         super().__init__(**kwargs)
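
A small usage sketch for the two config classes above, run from inside the checkpoint folder so the module imports directly; the values mirror config.json and are only illustrative:

from configuration_aimv2 import AIMv2Config, MonoConfig

# MonoConfig extends Qwen2Config; a plain dict passed as vision_config is
# promoted to an AIMv2Config inside __init__.
cfg = MonoConfig(
    hidden_size=1536,
    num_hidden_layers=28,
    num_attention_heads=12,
    num_key_value_heads=2,
    vision_config={
        "hidden_size": 3072,
        "intermediate_size": 8192,
        "num_hidden_layers": 24,
        "num_attention_heads": 24,
        "image_size": 448,
        "patch_size": 14,
    },
)
print(type(cfg.vision_config).__name__)  # AIMv2Config
print(cfg.ignore_index)                  # -100 (default)
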
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "transformers_version": "4.46.2",
+   "use_cache": false
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a35a90d9e2f350fbed0c43707bbc51e1ca4b7605512f054b571bbb06bf1b45b
+ size 4993038848
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bfee8731806f389a71319d02d93a67d6ab5520a27a440e3743117e8a55f14dc9
+ size 3634863200
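
The two entries above are Git LFS pointer files, not the weights themselves; the actual safetensors shards (about 8.6 GB combined) are fetched through LFS. A small sketch for checking a downloaded shard against the recorded oid, using the file names as listed and run from the checkpoint folder:

import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream the file so a multi-GB shard does not need to fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            h.update(chunk)
    return h.hexdigest()

# Should print the oid recorded above for the first shard:
# 8a35a90d9e2f350fbed0c43707bbc51e1ca4b7605512f054b571bbb06bf1b45b
print(sha256_of("model-00001-of-00002.safetensors"))
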
model.safetensors.index.json ADDED
@@ -0,0 +1,524 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 8627842048
4
+ },
5
+ "weight_map": {
6
+ "mm_projector.linear_proj.dense_4h_to_h.weight": "model-00002-of-00002.safetensors",
7
+ "mm_projector.linear_proj.dense_h_to_4h.weight": "model-00002-of-00002.safetensors",
8
+ "mm_projector.linear_proj.gate_proj.weight": "model-00002-of-00002.safetensors",
9
+ "mm_projector.linear_proj.linear_proj.weight": "model-00002-of-00002.safetensors",
10
+ "mm_projector.linear_proj.norm1.bias": "model-00002-of-00002.safetensors",
11
+ "mm_projector.linear_proj.norm1.weight": "model-00002-of-00002.safetensors",
12
+ "model.embed_tokens.weight": "model-00002-of-00002.safetensors",
13
+ "model.layers.0.input_layernorm.weight": "model-00002-of-00002.safetensors",
14
+ "model.layers.0.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
15
+ "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
16
+ "model.layers.0.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
17
+ "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
18
+ "model.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
19
+ "model.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
20
+ "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
21
+ "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
22
+ "model.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
23
+ "model.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
24
+ "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
25
+ "model.layers.1.input_layernorm.weight": "model-00002-of-00002.safetensors",
26
+ "model.layers.1.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
27
+ "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
28
+ "model.layers.1.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
29
+ "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
30
+ "model.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
31
+ "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
32
+ "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
33
+ "model.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
34
+ "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
35
+ "model.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
36
+ "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
37
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors",
38
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
39
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
40
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
41
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
42
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
43
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
44
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
45
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
46
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
47
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
48
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
49
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors",
50
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
51
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
52
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
53
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
54
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
55
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
56
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
57
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
58
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
59
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
60
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
61
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00002.safetensors",
62
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
63
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
64
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
65
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
66
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
67
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
68
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
69
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
70
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
71
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
72
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
73
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00002.safetensors",
74
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
75
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
76
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
77
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
78
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
79
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
80
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
81
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
82
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
83
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
84
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
85
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00002.safetensors",
86
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
87
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
88
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
89
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
90
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
91
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
92
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
93
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
94
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
95
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
96
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
97
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00002.safetensors",
98
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
99
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
100
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
101
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
102
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
103
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
104
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
105
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
106
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
107
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
108
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
109
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00002.safetensors",
110
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
111
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
112
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
113
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
114
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
115
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
116
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
117
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
118
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
119
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
120
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
121
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
122
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
123
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
124
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
125
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
126
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
127
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
128
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
129
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
130
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
131
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
132
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
133
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
134
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
135
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
136
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
137
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
138
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
139
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
140
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
141
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
142
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
143
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
144
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
145
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
146
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
147
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
148
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
149
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
150
+ "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
151
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
152
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
153
+ "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
154
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
155
+ "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
156
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
157
+ "model.layers.2.input_layernorm.weight": "model-00002-of-00002.safetensors",
158
+ "model.layers.2.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
159
+ "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
160
+ "model.layers.2.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
161
+ "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
162
+ "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
163
+ "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
164
+ "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
165
+ "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
166
+ "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
167
+ "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
168
+ "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
169
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
170
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
171
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
172
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
173
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
174
+ "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
175
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
176
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
177
+ "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
178
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
179
+ "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
180
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
181
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
182
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
183
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
184
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
185
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
186
+ "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
187
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
188
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
189
+ "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
190
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
191
+ "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
192
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
193
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
194
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
195
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
196
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
197
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
198
+ "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
199
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
200
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
201
+ "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
202
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
203
+ "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
204
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
205
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
206
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
207
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
208
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
209
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
210
+ "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
211
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
212
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
213
+ "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
214
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
215
+ "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
216
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
217
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
218
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
219
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
220
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
221
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
222
+ "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
223
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
224
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
225
+ "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
226
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
227
+ "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
228
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
229
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
230
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
231
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
232
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
233
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
234
+ "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
235
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
236
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
237
+ "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
238
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
239
+ "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
240
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
241
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
242
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
243
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
244
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
245
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
246
+ "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
247
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
248
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
249
+ "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
250
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
251
+ "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
252
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
253
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
254
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
255
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
256
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
257
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
258
+ "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
259
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
260
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
261
+ "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
262
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
263
+ "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
264
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
265
+ "model.layers.3.input_layernorm.weight": "model-00002-of-00002.safetensors",
266
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
267
+ "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
268
+ "model.layers.3.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
269
+ "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
270
+ "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
271
+ "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
272
+ "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
273
+ "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
274
+ "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
275
+ "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
276
+ "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
277
+ "model.layers.4.input_layernorm.weight": "model-00002-of-00002.safetensors",
278
+ "model.layers.4.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
279
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
280
+ "model.layers.4.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
281
+ "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
282
+ "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
283
+ "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
284
+ "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
285
+ "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
286
+ "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
287
+ "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
288
+ "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
289
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00002.safetensors",
290
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
291
+ "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
292
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
293
+ "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
294
+ "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
295
+ "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
296
+ "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
297
+ "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
298
+ "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
299
+ "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
300
+ "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
301
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00002.safetensors",
302
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
303
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
304
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
305
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
306
+ "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
307
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
308
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
309
+ "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
310
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
311
+ "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
312
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
313
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00002.safetensors",
314
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
315
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
316
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
317
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
318
+ "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
319
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
320
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
321
+ "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
322
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
323
+ "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
324
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
325
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00002.safetensors",
326
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
327
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
328
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
329
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
330
+ "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
331
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
332
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
333
+ "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
334
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
335
+ "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
336
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
337
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00002.safetensors",
338
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
339
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
340
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
341
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
342
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
343
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
344
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
345
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
346
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
347
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
348
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
349
+ "model.norm.weight": "model-00002-of-00002.safetensors",
350
+ "vision_tower.preprocessor.patchifier.norm.weight": "model-00001-of-00002.safetensors",
351
+ "vision_tower.preprocessor.patchifier.proj.bias": "model-00001-of-00002.safetensors",
352
+ "vision_tower.preprocessor.patchifier.proj.weight": "model-00001-of-00002.safetensors",
353
+ "vision_tower.preprocessor.pos_embed": "model-00001-of-00002.safetensors",
354
+ "vision_tower.trunk.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
355
+ "vision_tower.trunk.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
356
+ "vision_tower.trunk.blocks.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
357
+ "vision_tower.trunk.blocks.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
358
+ "vision_tower.trunk.blocks.0.mlp.fc3.weight": "model-00001-of-00002.safetensors",
359
+ "vision_tower.trunk.blocks.0.norm_1.weight": "model-00001-of-00002.safetensors",
360
+ "vision_tower.trunk.blocks.0.norm_2.weight": "model-00001-of-00002.safetensors",
361
+ "vision_tower.trunk.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
362
+ "vision_tower.trunk.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
363
+ "vision_tower.trunk.blocks.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
364
+ "vision_tower.trunk.blocks.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
365
+ "vision_tower.trunk.blocks.1.mlp.fc3.weight": "model-00001-of-00002.safetensors",
366
+ "vision_tower.trunk.blocks.1.norm_1.weight": "model-00001-of-00002.safetensors",
367
+ "vision_tower.trunk.blocks.1.norm_2.weight": "model-00001-of-00002.safetensors",
368
+ "vision_tower.trunk.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
369
+ "vision_tower.trunk.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
370
+ "vision_tower.trunk.blocks.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
371
+ "vision_tower.trunk.blocks.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
372
+ "vision_tower.trunk.blocks.10.mlp.fc3.weight": "model-00001-of-00002.safetensors",
373
+ "vision_tower.trunk.blocks.10.norm_1.weight": "model-00001-of-00002.safetensors",
374
+ "vision_tower.trunk.blocks.10.norm_2.weight": "model-00001-of-00002.safetensors",
375
+ "vision_tower.trunk.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
376
+ "vision_tower.trunk.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
377
+ "vision_tower.trunk.blocks.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
378
+ "vision_tower.trunk.blocks.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
379
+ "vision_tower.trunk.blocks.11.mlp.fc3.weight": "model-00001-of-00002.safetensors",
380
+ "vision_tower.trunk.blocks.11.norm_1.weight": "model-00001-of-00002.safetensors",
381
+ "vision_tower.trunk.blocks.11.norm_2.weight": "model-00001-of-00002.safetensors",
382
+ "vision_tower.trunk.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
383
+ "vision_tower.trunk.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
384
+ "vision_tower.trunk.blocks.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
385
+ "vision_tower.trunk.blocks.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
386
+ "vision_tower.trunk.blocks.12.mlp.fc3.weight": "model-00001-of-00002.safetensors",
387
+ "vision_tower.trunk.blocks.12.norm_1.weight": "model-00001-of-00002.safetensors",
388
+ "vision_tower.trunk.blocks.12.norm_2.weight": "model-00001-of-00002.safetensors",
389
+ "vision_tower.trunk.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
390
+ "vision_tower.trunk.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
391
+ "vision_tower.trunk.blocks.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
392
+ "vision_tower.trunk.blocks.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
393
+ "vision_tower.trunk.blocks.13.mlp.fc3.weight": "model-00001-of-00002.safetensors",
394
+ "vision_tower.trunk.blocks.13.norm_1.weight": "model-00001-of-00002.safetensors",
395
+ "vision_tower.trunk.blocks.13.norm_2.weight": "model-00001-of-00002.safetensors",
396
+ "vision_tower.trunk.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
397
+ "vision_tower.trunk.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
398
+ "vision_tower.trunk.blocks.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
399
+ "vision_tower.trunk.blocks.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
400
+ "vision_tower.trunk.blocks.14.mlp.fc3.weight": "model-00001-of-00002.safetensors",
401
+ "vision_tower.trunk.blocks.14.norm_1.weight": "model-00001-of-00002.safetensors",
402
+ "vision_tower.trunk.blocks.14.norm_2.weight": "model-00001-of-00002.safetensors",
403
+ "vision_tower.trunk.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
404
+ "vision_tower.trunk.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
405
+ "vision_tower.trunk.blocks.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
406
+ "vision_tower.trunk.blocks.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
407
+ "vision_tower.trunk.blocks.15.mlp.fc3.weight": "model-00001-of-00002.safetensors",
408
+ "vision_tower.trunk.blocks.15.norm_1.weight": "model-00001-of-00002.safetensors",
409
+ "vision_tower.trunk.blocks.15.norm_2.weight": "model-00001-of-00002.safetensors",
410
+ "vision_tower.trunk.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
411
+ "vision_tower.trunk.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
412
+ "vision_tower.trunk.blocks.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
413
+ "vision_tower.trunk.blocks.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
414
+ "vision_tower.trunk.blocks.16.mlp.fc3.weight": "model-00001-of-00002.safetensors",
415
+ "vision_tower.trunk.blocks.16.norm_1.weight": "model-00001-of-00002.safetensors",
416
+ "vision_tower.trunk.blocks.16.norm_2.weight": "model-00001-of-00002.safetensors",
417
+ "vision_tower.trunk.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
418
+ "vision_tower.trunk.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
419
+ "vision_tower.trunk.blocks.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
420
+ "vision_tower.trunk.blocks.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
421
+ "vision_tower.trunk.blocks.17.mlp.fc3.weight": "model-00001-of-00002.safetensors",
422
+ "vision_tower.trunk.blocks.17.norm_1.weight": "model-00001-of-00002.safetensors",
423
+ "vision_tower.trunk.blocks.17.norm_2.weight": "model-00001-of-00002.safetensors",
424
+ "vision_tower.trunk.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
425
+ "vision_tower.trunk.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
426
+ "vision_tower.trunk.blocks.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
427
+ "vision_tower.trunk.blocks.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
428
+ "vision_tower.trunk.blocks.18.mlp.fc3.weight": "model-00001-of-00002.safetensors",
429
+ "vision_tower.trunk.blocks.18.norm_1.weight": "model-00001-of-00002.safetensors",
430
+ "vision_tower.trunk.blocks.18.norm_2.weight": "model-00001-of-00002.safetensors",
431
+ "vision_tower.trunk.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
432
+ "vision_tower.trunk.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
433
+ "vision_tower.trunk.blocks.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
434
+ "vision_tower.trunk.blocks.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
435
+ "vision_tower.trunk.blocks.19.mlp.fc3.weight": "model-00001-of-00002.safetensors",
436
+ "vision_tower.trunk.blocks.19.norm_1.weight": "model-00001-of-00002.safetensors",
437
+ "vision_tower.trunk.blocks.19.norm_2.weight": "model-00001-of-00002.safetensors",
438
+ "vision_tower.trunk.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
439
+ "vision_tower.trunk.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
440
+ "vision_tower.trunk.blocks.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
441
+ "vision_tower.trunk.blocks.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
442
+ "vision_tower.trunk.blocks.2.mlp.fc3.weight": "model-00001-of-00002.safetensors",
443
+ "vision_tower.trunk.blocks.2.norm_1.weight": "model-00001-of-00002.safetensors",
444
+ "vision_tower.trunk.blocks.2.norm_2.weight": "model-00001-of-00002.safetensors",
445
+ "vision_tower.trunk.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
446
+ "vision_tower.trunk.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
447
+ "vision_tower.trunk.blocks.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
448
+ "vision_tower.trunk.blocks.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
449
+ "vision_tower.trunk.blocks.20.mlp.fc3.weight": "model-00001-of-00002.safetensors",
450
+ "vision_tower.trunk.blocks.20.norm_1.weight": "model-00001-of-00002.safetensors",
451
+ "vision_tower.trunk.blocks.20.norm_2.weight": "model-00001-of-00002.safetensors",
452
+ "vision_tower.trunk.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
453
+ "vision_tower.trunk.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
454
+ "vision_tower.trunk.blocks.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
455
+ "vision_tower.trunk.blocks.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
456
+ "vision_tower.trunk.blocks.21.mlp.fc3.weight": "model-00001-of-00002.safetensors",
457
+ "vision_tower.trunk.blocks.21.norm_1.weight": "model-00001-of-00002.safetensors",
458
+ "vision_tower.trunk.blocks.21.norm_2.weight": "model-00001-of-00002.safetensors",
459
+ "vision_tower.trunk.blocks.22.attn.proj.weight": "model-00002-of-00002.safetensors",
460
+ "vision_tower.trunk.blocks.22.attn.qkv.weight": "model-00002-of-00002.safetensors",
461
+ "vision_tower.trunk.blocks.22.mlp.fc1.weight": "model-00002-of-00002.safetensors",
462
+ "vision_tower.trunk.blocks.22.mlp.fc2.weight": "model-00002-of-00002.safetensors",
463
+ "vision_tower.trunk.blocks.22.mlp.fc3.weight": "model-00002-of-00002.safetensors",
464
+ "vision_tower.trunk.blocks.22.norm_1.weight": "model-00002-of-00002.safetensors",
465
+ "vision_tower.trunk.blocks.22.norm_2.weight": "model-00002-of-00002.safetensors",
466
+ "vision_tower.trunk.blocks.23.attn.proj.weight": "model-00002-of-00002.safetensors",
467
+ "vision_tower.trunk.blocks.23.attn.qkv.weight": "model-00002-of-00002.safetensors",
468
+ "vision_tower.trunk.blocks.23.mlp.fc1.weight": "model-00002-of-00002.safetensors",
469
+ "vision_tower.trunk.blocks.23.mlp.fc2.weight": "model-00002-of-00002.safetensors",
470
+ "vision_tower.trunk.blocks.23.mlp.fc3.weight": "model-00002-of-00002.safetensors",
471
+ "vision_tower.trunk.blocks.23.norm_1.weight": "model-00002-of-00002.safetensors",
472
+ "vision_tower.trunk.blocks.23.norm_2.weight": "model-00002-of-00002.safetensors",
473
+ "vision_tower.trunk.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
474
+ "vision_tower.trunk.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
475
+ "vision_tower.trunk.blocks.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
476
+ "vision_tower.trunk.blocks.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
477
+ "vision_tower.trunk.blocks.3.mlp.fc3.weight": "model-00001-of-00002.safetensors",
478
+ "vision_tower.trunk.blocks.3.norm_1.weight": "model-00001-of-00002.safetensors",
479
+ "vision_tower.trunk.blocks.3.norm_2.weight": "model-00001-of-00002.safetensors",
480
+ "vision_tower.trunk.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
481
+ "vision_tower.trunk.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
482
+ "vision_tower.trunk.blocks.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
483
+ "vision_tower.trunk.blocks.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
484
+ "vision_tower.trunk.blocks.4.mlp.fc3.weight": "model-00001-of-00002.safetensors",
485
+ "vision_tower.trunk.blocks.4.norm_1.weight": "model-00001-of-00002.safetensors",
486
+ "vision_tower.trunk.blocks.4.norm_2.weight": "model-00001-of-00002.safetensors",
487
+ "vision_tower.trunk.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
488
+ "vision_tower.trunk.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
489
+ "vision_tower.trunk.blocks.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
490
+ "vision_tower.trunk.blocks.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
491
+ "vision_tower.trunk.blocks.5.mlp.fc3.weight": "model-00001-of-00002.safetensors",
492
+ "vision_tower.trunk.blocks.5.norm_1.weight": "model-00001-of-00002.safetensors",
493
+ "vision_tower.trunk.blocks.5.norm_2.weight": "model-00001-of-00002.safetensors",
494
+ "vision_tower.trunk.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
495
+ "vision_tower.trunk.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
496
+ "vision_tower.trunk.blocks.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
497
+ "vision_tower.trunk.blocks.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
498
+ "vision_tower.trunk.blocks.6.mlp.fc3.weight": "model-00001-of-00002.safetensors",
499
+ "vision_tower.trunk.blocks.6.norm_1.weight": "model-00001-of-00002.safetensors",
500
+ "vision_tower.trunk.blocks.6.norm_2.weight": "model-00001-of-00002.safetensors",
501
+ "vision_tower.trunk.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
502
+ "vision_tower.trunk.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
503
+ "vision_tower.trunk.blocks.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
504
+ "vision_tower.trunk.blocks.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
505
+ "vision_tower.trunk.blocks.7.mlp.fc3.weight": "model-00001-of-00002.safetensors",
506
+ "vision_tower.trunk.blocks.7.norm_1.weight": "model-00001-of-00002.safetensors",
507
+ "vision_tower.trunk.blocks.7.norm_2.weight": "model-00001-of-00002.safetensors",
508
+ "vision_tower.trunk.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
509
+ "vision_tower.trunk.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
510
+ "vision_tower.trunk.blocks.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
511
+ "vision_tower.trunk.blocks.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
512
+ "vision_tower.trunk.blocks.8.mlp.fc3.weight": "model-00001-of-00002.safetensors",
513
+ "vision_tower.trunk.blocks.8.norm_1.weight": "model-00001-of-00002.safetensors",
514
+ "vision_tower.trunk.blocks.8.norm_2.weight": "model-00001-of-00002.safetensors",
515
+ "vision_tower.trunk.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
516
+ "vision_tower.trunk.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
517
+ "vision_tower.trunk.blocks.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
518
+ "vision_tower.trunk.blocks.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
519
+ "vision_tower.trunk.blocks.9.mlp.fc3.weight": "model-00001-of-00002.safetensors",
520
+ "vision_tower.trunk.blocks.9.norm_1.weight": "model-00001-of-00002.safetensors",
521
+ "vision_tower.trunk.blocks.9.norm_2.weight": "model-00001-of-00002.safetensors",
522
+ "vision_tower.trunk.post_trunk_norm.weight": "model-00002-of-00002.safetensors"
523
+ }
524
+ }
modeling_aimv2.py ADDED
@@ -0,0 +1,287 @@
1
+ from typing import Optional, Tuple, Union
2
+
3
+ import torch
4
+ from .configuration_aimv2 import AIMv2Config
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+ from transformers.modeling_outputs import BaseModelOutputWithNoAttention
8
+ from transformers.modeling_utils import PreTrainedModel
9
+
10
+ __all__ = ["AIMv2Model"]
11
+
12
+
13
+ class RMSNorm(nn.Module):
14
+ def __init__(self, dim: int, eps: float = 1e-6):
15
+ super().__init__()
16
+ self.weight = nn.Parameter(torch.ones(dim))
17
+ self.eps = eps
18
+
19
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
20
+ output = self._norm(x.float()).type_as(x)
21
+ return output * self.weight
22
+
23
+ def extra_repr(self) -> str:
24
+ return f"{tuple(self.weight.shape)}, eps={self.eps}"
25
+
26
+ def _norm(self, x: torch.Tensor) -> torch.Tensor:
27
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
28
+
29
+
30
+ class AIMv2SwiGLUFFN(nn.Module):
31
+ def __init__(self, config: AIMv2Config):
32
+ super().__init__()
33
+ hidden_features = config.intermediate_size
34
+ in_features = config.hidden_size
35
+ bias = config.use_bias
36
+
37
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
38
+ self.fc2 = nn.Linear(hidden_features, in_features, bias=bias)
39
+ self.fc3 = nn.Linear(in_features, hidden_features, bias=bias)
40
+
41
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
42
+ x = F.silu(self.fc1(x)) * self.fc3(x)
43
+ x = self.fc2(x)
44
+ return x
45
+
46
+
47
+ class AIMv2PatchEmbed(nn.Module):
48
+ def __init__(self, config: AIMv2Config):
49
+ super().__init__()
50
+ self.proj = nn.Conv2d(
51
+ config.num_channels,
52
+ config.hidden_size,
53
+ kernel_size=(config.patch_size, config.patch_size),
54
+ stride=(config.patch_size, config.patch_size),
55
+ )
56
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
57
+
58
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
59
+ x = self.proj(x).flatten(2).transpose(1, 2)
60
+ x = self.norm(x)
61
+ return x
62
+
63
+
64
+ class AIMv2ViTPreprocessor(nn.Module):
65
+ def __init__(self, config: AIMv2Config):
66
+ super().__init__()
67
+ num_patches = (config.image_size // config.patch_size) ** 2
68
+
69
+ self.patchifier = AIMv2PatchEmbed(config)
70
+ self.pos_embed = nn.Parameter(torch.zeros((1, num_patches, config.hidden_size)))
71
+
72
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
73
+ tokens = self.patchifier(x)
74
+ _, N, _ = tokens.shape
75
+ pos_embed = self.pos_embed.to(tokens.device)
76
+ tokens = tokens + pos_embed[:, :N]
77
+ return tokens
78
+
79
+
80
+ class AIMv2Attention(nn.Module):
81
+ def __init__(self, config: AIMv2Config):
82
+ super().__init__()
83
+ dim = config.hidden_size
84
+
85
+ self.num_heads = config.num_attention_heads
86
+ self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias)
87
+ self.attn_drop = nn.Dropout(config.attention_dropout)
88
+ self.proj = nn.Linear(dim, dim, bias=config.use_bias)
89
+ self.proj_drop = nn.Dropout(config.projection_dropout)
90
+
91
+ def forward(
92
+ self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
93
+ ) -> torch.Tensor:
94
+ B, N, C = x.shape
95
+ qkv = (
96
+ self.qkv(x)
97
+ .reshape(B, N, 3, self.num_heads, C // self.num_heads)
98
+ .permute(2, 0, 3, 1, 4)
99
+ )
100
+ q, k, v = qkv.unbind(0)
101
+
102
+ x = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
103
+ x = x.transpose(1, 2).contiguous().reshape(B, N, C)
104
+ x = self.proj(x)
105
+ x = self.proj_drop(x)
106
+ return x
107
+
108
+
109
+ class AIMv2Block(nn.Module):
110
+ def __init__(self, config: AIMv2Config):
111
+ super().__init__()
112
+ self.attn = AIMv2Attention(config)
113
+ self.norm_1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
114
+ self.mlp = AIMv2SwiGLUFFN(config)
115
+ self.norm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
116
+
117
+ def forward(
118
+ self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
119
+ ) -> torch.Tensor:
120
+ x = x + self.attn(self.norm_1(x), mask)
121
+ x = x + self.mlp(self.norm_2(x))
122
+ return x
123
+
124
+
125
+ class AIMv2Transformer(nn.Module):
126
+ def __init__(self, config: AIMv2Config):
127
+ super().__init__()
128
+ self.blocks = nn.ModuleList(
129
+ [AIMv2Block(config) for _ in range(config.num_hidden_layers)]
130
+ )
131
+ self.post_trunk_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
132
+
133
+ def forward(
134
+ self,
135
+ tokens: torch.Tensor,
136
+ mask: Optional[torch.Tensor] = None,
137
+ output_hidden_states: bool = False,
138
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, ...]]]:
139
+ hidden_states = () if output_hidden_states else None
140
+ for block in self.blocks:
141
+ tokens = block(tokens, mask)
142
+ if output_hidden_states:
143
+ hidden_states += (tokens,)
144
+ tokens = self.post_trunk_norm(tokens)
145
+ return tokens, hidden_states
146
+
147
+
148
+ class AIMv2PretrainedModel(PreTrainedModel):
149
+ config_class = AIMv2Config
150
+ base_model_prefix = "aimv2"
151
+ main_input_name = "pixel_values"
152
+ _supports_sdpa = True
153
+
154
+
155
+ class AIMv2Model(AIMv2PretrainedModel):
156
+ def __init__(self, config: AIMv2Config):
157
+ super().__init__(config)
158
+ self.preprocessor = AIMv2ViTPreprocessor(config)
159
+ self.trunk = AIMv2Transformer(config)
160
+
161
+ def forward(
162
+ self,
163
+ pixel_values: torch.Tensor,
164
+ mask: Optional[torch.Tensor] = None,
165
+ output_hidden_states: Optional[bool] = None,
166
+ return_dict: Optional[bool] = None,
167
+ ) -> Union[
168
+ Tuple[torch.Tensor],
169
+ Tuple[torch.Tensor, Tuple[torch.Tensor, ...]],
170
+ BaseModelOutputWithNoAttention,
171
+ ]:
172
+ if output_hidden_states is None:
173
+ output_hidden_states = self.config.output_hidden_states
174
+ if return_dict is None:
175
+ return_dict = self.config.use_return_dict
176
+
177
+ x = self.preprocessor(pixel_values)
178
+ x, hidden_states = self.trunk(
179
+ x, mask, output_hidden_states=output_hidden_states
180
+ )
181
+
182
+ if not return_dict:
183
+ res = (x,)
184
+ res += (hidden_states,) if output_hidden_states else ()
185
+ return res
186
+
187
+ return BaseModelOutputWithNoAttention(
188
+ last_hidden_state=x,
189
+ hidden_states=hidden_states,
190
+ )
191
+
192
+
193
+
194
+ from functools import partial
195
+ from torch import nn
196
+ import torch.nn.functional as F
197
+ from transformers.activations import ACT2FN
198
+ import math
199
+ import torch
200
+
201
+
202
+ class GLU(nn.Module):
203
+ def __init__(self, hidden_size, ffn_hidden_size, in_features):
204
+ super().__init__()
205
+ self.linear_proj = nn.Linear(in_features, hidden_size, bias=False)
206
+ self.norm1 = nn.LayerNorm(hidden_size)
207
+ self.act1 = nn.GELU()
208
+ self.act2 = nn.functional.silu
209
+ self.dense_h_to_4h = nn.Linear(hidden_size, ffn_hidden_size, bias=False)
210
+ self.gate_proj = nn.Linear(hidden_size, ffn_hidden_size, bias=False)
211
+ self.dense_4h_to_h = nn.Linear(ffn_hidden_size, hidden_size, bias=False)
212
+
213
+ def forward(self, x):
214
+ x = self.linear_proj(x)
215
+ x = self.act1(self.norm1(x))
216
+ x = self.act2(self.gate_proj(x)) * self.dense_h_to_4h(x)
217
+ x = self.dense_4h_to_h(x)
218
+ return x
219
+
220
+
221
+ class MlpGLU(nn.Module):
222
+ def __init__(self, in_hidden_size, out_hidden_size):
223
+ super(MlpGLU, self).__init__()
224
+
225
+ ffn_hidden_size = out_hidden_size * 4 # e.g. 3584 * 4 = 14336
226
+ self.linear_proj = GLU(
227
+ hidden_size=out_hidden_size,
228
+ ffn_hidden_size=ffn_hidden_size,
229
+ in_features=in_hidden_size,
230
+ )
231
+
232
+ def forward(self, x, attention_mask: torch.Tensor = None):
233
+ x = self.linear_proj(x)
234
+ return x
235
+
236
+
237
+ class PixelShuffleLayer(nn.Module):
238
+
239
+ def __init__(self):
240
+ super(PixelShuffleLayer, self).__init__()
241
+
242
+ def forward(self, x, scale_factor=0.5):
243
+ # print(f'in pixelshuffle: {x.shape}')
244
+ n, w, h, c = x.size()
245
+ # N, W, H, C --> N, W, H * scale, C // scale
246
+ x = x.reshape(n, w, int(h * scale_factor), int(c / scale_factor))
247
+ # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
248
+ x = x.permute(0, 2, 1, 3).contiguous()
249
+ # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
250
+ x = x.view(n, int(h * scale_factor), int(w * scale_factor),
251
+ int(c / (scale_factor * scale_factor)))
252
+ x = x.permute(0, 2, 1, 3).contiguous()
253
+ return x
254
+
255
+
256
+
257
+ class PixelShuffleConnector(nn.Module):
258
+
259
+ def __init__(self, in_hidden_size, out_hidden_size, down_rate=2):
260
+ super(PixelShuffleConnector, self).__init__()
261
+ # ffn_hidden_size = 13696
262
+ ffn_hidden_size = out_hidden_size * 4 # e.g. 3584 * 4 = 14336
263
+ self.linear_proj = GLU(
264
+ hidden_size=out_hidden_size,
265
+ ffn_hidden_size=ffn_hidden_size,
266
+ in_features=in_hidden_size * 4,
267
+ )
268
+ self.down_rate = down_rate
269
+ if self.down_rate == 2:
270
+ down = PixelShuffleLayer()
271
+ self.downsample = nn.Sequential(*[down])
272
+ else:
273
+ print(f"unsupported downsample rate {down_rate}, only 2 is supported for now!")
274
+ self.scaling_factor = 8
275
+
276
+
277
+ def forward(self, x, attention_mask: torch.Tensor = None):
278
+ # print(f'xin: {x.shape}')
279
+ b, s, h = x.shape
280
+ grid_size = int(s**0.5)
281
+ x = x.reshape(b, grid_size, grid_size, h)
282
+ x = self.downsample(x)
283
+ # print(f'x: {x.shape}')
284
+ # [11, 16, 16, 4608]
285
+ x = x.reshape(x.shape[0], -1, x.shape[-1])
286
+ x = self.linear_proj(x)
287
+ return x
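Editor's note: the snippet below is not part of the uploaded modeling_aimv2.py; it is a minimal, hedged shape check for the PixelShuffleConnector defined above. The dimensions (a 32x32 vision-token grid, 1024-dim vision features, 3584-dim language-model hidden size) are assumptions for illustration only.

import torch
# Assumes modeling_aimv2 is importable as part of the repo package
# (the file uses a relative import of configuration_aimv2).
from modeling_aimv2 import PixelShuffleConnector

connector = PixelShuffleConnector(in_hidden_size=1024, out_hidden_size=3584, down_rate=2)
vision_tokens = torch.randn(2, 32 * 32, 1024)   # (batch, seq_len, vision_hidden)
projected = connector(vision_tokens)
# The pixel shuffle halves each spatial side and quadruples the channels, so the
# 1024 vision tokens become 256 tokens which the GLU then projects to the LLM width.
print(projected.shape)                          # torch.Size([2, 256, 3584])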
modeling_mono.py ADDED
@@ -0,0 +1,370 @@
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional, Tuple, Union
3
+ import torch
4
+ from torch.nn import CrossEntropyLoss
5
+
6
+ from transformers import Qwen2Config, Qwen2Model, Qwen2ForCausalLM, PreTrainedModel
7
+ from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput
8
+ from torch import nn
9
+ import torch.nn.functional as F
10
+ from .configuration_aimv2 import MonoConfig
11
+ from .modeling_aimv2 import AIMv2Model, PixelShuffleConnector
12
+ from transformers.generation import GenerationMixin
13
+
14
+ """
15
+
16
+ Simple architecture of Mono, used to pretrain the vision encoder.
17
+
18
+ """
19
+
20
+
21
+ @dataclass
22
+ class MonoCausalLMOutputWithPast(ModelOutput):
23
+
24
+ loss: Optional[torch.FloatTensor] = None
25
+ logits: torch.FloatTensor = None
26
+ past_key_values: Optional[List[torch.FloatTensor]] = None
27
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
28
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
29
+ image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
30
+
31
+
32
+ class MonoPretrainedModel(PreTrainedModel):
33
+ config_class = MonoConfig
34
+ base_model_prefix = "mono"
35
+ # main_input_name = "pixel_values"
36
+ _supports_sdpa = True
37
+ _supports_flash_attn_2 = True
38
+ _supports_cache_class = True
39
+ supports_gradient_checkpointing = True
40
+
41
+
42
+ # class MonoForConditionalGeneration(MonoPretrainedModel, Qwen2ForCausalLM):
43
+ class MonoForConditionalGeneration(MonoPretrainedModel, GenerationMixin):
44
+ _tied_weights_keys = ["lm_head.weight"]
45
+
46
+ def __init__(self, config: MonoConfig):
47
+ # super().__init__(config)
48
+ MonoPretrainedModel.__init__(self, config)
49
+ # super(Qwen2ForCausalLM, self).__init__(config)
50
+
51
+ self.vision_tower = AIMv2Model(config=config.vision_config)
52
+ self._attn_implementation = config._attn_implementation
53
+
54
+ self._build_image_projection_layers(config)
55
+
56
+ self.model = Qwen2Model(config)
57
+ self.vocab_size = config.vocab_size
58
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
59
+
60
+ self.pad_token_id = config.pad_token_id
61
+ print(f"==> pad_token_id: {self.pad_token_id}")
62
+ self.post_init()
63
+
64
+ def _build_image_projection_layers(self, config):
65
+ image_dim_out = config.vision_config.hidden_size
66
+ dim_projection = config.hidden_size
67
+ # self.mm_projector = nn.Linear(image_dim_out, dim_projection)
68
+ self.mm_projector = PixelShuffleConnector(image_dim_out, dim_projection)
69
+ print(f"==> build mm_projector: {image_dim_out} -> {dim_projection}")
70
+
71
+ def get_vision_tower(self):
72
+ return self.vision_tower
73
+
74
+ def get_input_embeddings(self):
75
+ return self.model.get_input_embeddings()
76
+
77
+ def resize_token_embeddings(
78
+ self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None
79
+ ) -> nn.Embedding:
80
+ model_embeds = self.model.resize_token_embeddings(
81
+ new_num_tokens, pad_to_multiple_of
82
+ )
83
+ # update vocab size
84
+ self.config.text_config.vocab_size = model_embeds.num_embeddings
85
+ self.config.vocab_size = model_embeds.num_embeddings
86
+ self.vocab_size = model_embeds.num_embeddings
87
+ return model_embeds
88
+
89
+ def _encode_image(self, pixel_values):
90
+ # print(f"pixel_values: {pixel_values}")
91
+ batch_size, C, H, W = pixel_values.shape
92
+ x = self.vision_tower(pixel_values, output_hidden_states=True)
93
+ x = x[-2]
94
+ # print(x)
95
+ x = self.mm_projector(x)
96
+ # print(f"image features: {x}")
97
+ return x
98
+
99
+ def forward(
100
+ self,
101
+ input_ids: Optional[torch.LongTensor] = None,
102
+ pixel_values: torch.FloatTensor = None,
103
+ attention_mask: Optional[torch.FloatTensor] = None,
104
+ position_ids: Optional[torch.LongTensor] = None,
105
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
106
+ inputs_embeds: Optional[torch.FloatTensor] = None,
107
+ labels: Optional[torch.LongTensor] = None,
108
+ use_cache: Optional[bool] = None,
109
+ output_attentions: Optional[bool] = None,
110
+ output_hidden_states: Optional[bool] = None,
111
+ return_dict: Optional[bool] = None,
112
+ cache_position=None,
113
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
114
+ output_attentions = (
115
+ output_attentions
116
+ if output_attentions is not None
117
+ else self.config.output_attentions
118
+ )
119
+ output_hidden_states = (
120
+ output_hidden_states
121
+ if output_hidden_states is not None
122
+ else self.config.output_hidden_states
123
+ )
124
+ return_dict = (
125
+ return_dict if return_dict is not None else self.config.use_return_dict
126
+ )
127
+
128
+ image_features = None
129
+ if inputs_embeds is None:
130
+ if pixel_values is not None:
131
+ # (batch_size, num_image_tokens, hidden_size)
132
+ image_features = self._encode_image(pixel_values)
133
+
134
+ if input_ids is not None:
135
+ inputs_embeds, attention_mask, labels = (
136
+ self._get_input_embeds_with_image(input_ids, image_features, labels)
137
+ )
138
+
139
+ # print(f'before inputs_embeds: {inputs_embeds.shape}')
140
+ # print(f'before labels: {labels.shape}')
141
+
142
+ # padding all to normal sequence length only train
143
+ # if labels is not None:
144
+ # input_length = inputs_embeds.shape[1]
145
+ # label_length = labels.shape[1]
146
+
147
+ # if labels is not None:
148
+ # labels = F.pad(labels, (input_length, 0), value=-100)
149
+
150
+ # if inputs_embeds is not None:
151
+ # # append embeds and attn_mask to labels length
152
+ # padding = torch.zeros(
153
+ # inputs_embeds.shape[0],
154
+ # label_length,
155
+ # inputs_embeds.shape[2],
156
+ # dtype=inputs_embeds.dtype,
157
+ # device=inputs_embeds.device,
158
+ # )
159
+ # inputs_embeds = torch.cat([inputs_embeds, padding], dim=1)
160
+ # attention_mask = attention_mask.to(inputs_embeds.dtype)
161
+ # attention_mask = F.pad(attention_mask, (0, label_length), value=0)
162
+
163
+ # if position_ids is None:
164
+ # position_ids = torch.arange(
165
+ # input_length + label_length, device=inputs_embeds.device
166
+ # )
167
+ # position_ids = position_ids.unsqueeze(0).expand(
168
+ # inputs_embeds.shape[0], -1
169
+ # )
170
+ # position_ids[input_length:] = 0
171
+
172
+ # print(f"position_ids {position_ids}")
173
+ # print(f"labels {labels.shape}")
174
+ # print(f"labels {labels}")
175
+ # print(f"inputs_embeds {inputs_embeds.shape}")
176
+ # print(f"inputs_embeds {inputs_embeds}")
177
+ # print(f"attention_mask {attention_mask.shape}")
178
+ # print(f"attention_mask {attention_mask}")
179
+
180
+ outputs = self.model(
181
+ input_ids=None,
182
+ attention_mask=attention_mask,
183
+ position_ids=position_ids,
184
+ past_key_values=past_key_values,
185
+ inputs_embeds=inputs_embeds,
186
+ use_cache=use_cache,
187
+ output_attentions=output_attentions,
188
+ output_hidden_states=output_hidden_states,
189
+ return_dict=return_dict,
190
+ )
191
+
192
+ hidden_states = outputs[0]
193
+ logits = self.lm_head(hidden_states)
194
+
195
+ loss = None
196
+ if labels is not None:
197
+ # Upcast to float if we need to compute the loss to avoid potential precision issues
198
+ logits = logits.float()
199
+ labels = labels.to(logits.device)
200
+ # Shift so that tokens < n predict n
201
+ if attention_mask is not None:
202
+ # we use the input attention mask to shift the logits and labels, because it is 2D.
203
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
204
+ shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(
205
+ logits.device
206
+ )
207
+ shift_logits = logits[..., :-1, :][
208
+ shift_attention_mask != 0
209
+ ].contiguous()
210
+ # print(f"shift_logits: {shift_logits.shape}")
211
+ shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous()
212
+ # print(f"shift_labels: {shift_labels.shape}")
213
+ else:
214
+ shift_logits = logits[..., :-1, :].contiguous()
215
+ shift_labels = labels[..., 1:].contiguous()
216
+ # Flatten the tokens
217
+ loss_fct = CrossEntropyLoss()
218
+ loss = loss_fct(
219
+ shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
220
+ )
221
+
222
+ if not return_dict:
223
+ output = (logits,) + outputs[1:]
224
+ return (loss,) + output if loss is not None else output
225
+
226
+ return MonoCausalLMOutputWithPast(
227
+ loss=loss,
228
+ logits=logits,
229
+ past_key_values=outputs.past_key_values,
230
+ hidden_states=outputs.hidden_states,
231
+ attentions=outputs.attentions,
232
+ )
233
+
234
+ def _get_input_embeds_with_image(self, input_ids, image_features, labels=None):
235
+ # 1. replace the image token (-200) with image features; 2. drop -100 padding tokens from input_ids;
236
+ # 3. build the matching attention mask (and labels, if provided).
237
+ # Sequences are then padded to the longest length in the batch.
238
+ batch_size = input_ids.size(0)
239
+ processed_embeds = []
240
+ processed_masks = []
241
+ labels_ignored_im = []
242
+
243
+ max_seq_len = 0
244
+ for idx in range(batch_size):
245
+ seq = input_ids[idx]
246
+ im_pos = (seq == -200).nonzero(as_tuple=True)[0]
247
+
248
+ if im_pos.numel() > 0:
249
+ im_pos = im_pos.item()
250
+ before = seq[:im_pos]
251
+ after = seq[im_pos + 1 :]
252
+ # Exclude -100 tokens (input_ids may be intentionally padded with -100)
253
+ before = before[before != -100]
254
+ after = after[after != -100]
255
+ # Get embeddings for before and after
256
+ before_embed = self.get_input_embeddings()(before)
257
+ after_embed = self.get_input_embeddings()(after)
258
+ # Concatenate before, image features, and after
259
+ seq_embed = torch.cat(
260
+ [before_embed, image_features[idx], after_embed], dim=0
261
+ )
262
+ new_seq_len = seq_embed.size(0)
263
+
264
+ # if labels is not None, replace the image token with -100s spanning the image feature length
265
+ if labels is not None:
266
+ image_token_ignore = torch.full(
267
+ (image_features[idx].shape[0],),
268
+ -100,
269
+ dtype=torch.long,
270
+ device=labels.device,
271
+ )
272
+ labels_ignored_im.append(
273
+ torch.cat(
274
+ (
275
+ labels[idx][:im_pos],
276
+ image_token_ignore,
277
+ labels[idx][im_pos + 1 :],
278
+ ),
279
+ dim=0,
280
+ )
281
+ )
282
+
283
+ else:
284
+ # Exclude -100 tokens
285
+ valid_tokens = seq[seq != -100]
286
+ seq_embed = self.get_input_embeddings()(valid_tokens)
287
+ new_seq_len = seq_embed.size(0)
288
+
289
+ # Update the maximum sequence length
290
+ if new_seq_len > max_seq_len:
291
+ max_seq_len = new_seq_len
292
+
293
+ processed_embeds.append(seq_embed)
294
+ attn_mask = torch.ones(new_seq_len, dtype=torch.bool, device=seq.device)
295
+ processed_masks.append(attn_mask)
296
+
297
+ # pad the rest: embeddings with 0.0, attention mask with False
298
+ inputs_embeds = torch.nn.utils.rnn.pad_sequence(
299
+ processed_embeds, batch_first=True, padding_value=0.0
300
+ )
301
+ attn_masks = torch.nn.utils.rnn.pad_sequence(
302
+ processed_masks, batch_first=True, padding_value=0
303
+ )
304
+ if labels is not None:
305
+ labels_ignored_im = torch.stack(labels_ignored_im, dim=0)
306
+ return inputs_embeds, attn_masks, labels_ignored_im
307
+ return inputs_embeds, attn_masks, None
308
+
309
+ @torch.no_grad()
310
+ def generate(self, input_ids, pixel_values=None, **kwargs):
311
+ # print(input_ids)
312
+ # print(f"pixel_values {pixel_values}")
313
+ if pixel_values is not None:
314
+ image_features = self._encode_image(pixel_values)
315
+ # print(f"image_features {image_features}")
316
+ inputs_embeds, attention_mask, _ = self._get_input_embeds_with_image(
317
+ input_ids, image_features
318
+ )
319
+ else:
320
+ if input_ids is not None:
321
+ inputs_embeds = self.get_input_embeddings()(input_ids)
322
+ attention_mask = torch.ones(
323
+ inputs_embeds.size(0),
324
+ inputs_embeds.size(1),
325
+ dtype=torch.bool,
326
+ device=inputs_embeds.device,
327
+ )
328
+
329
+ # print(f"inputs_embeds: {inputs_embeds}")
330
+ return super().generate(
331
+ input_ids=None,
332
+ inputs_embeds=inputs_embeds,
333
+ attention_mask=attention_mask,
334
+ **kwargs,
335
+ )
336
+
337
+ def prepare_inputs_for_generation(
338
+ self,
339
+ input_ids,
340
+ past_key_values=None,
341
+ inputs_embeds=None,
342
+ attention_mask=None,
343
+ **kwargs,
344
+ ):
345
+ # cut input_ids if past_key_values is used
346
+ # if past_key_values is not None:
347
+ # past_length = past_key_values[0][0].shape[2]
348
+
349
+ # # Some generation methods already pass only the last input ID
350
+ # if input_ids.shape[1] > past_length:
351
+ # input_ids = input_ids[:, -1:]
352
+ # elif input_ids.shape[1] == 1:
353
+ # pass
354
+ # else:
355
+ # # Default to old behavior: keep only final ID
356
+ # input_ids = input_ids[:, -1:]
357
+
358
+ model_inputs = super().prepare_inputs_for_generation(
359
+ input_ids,
360
+ past_key_values=past_key_values,
361
+ inputs_embeds=inputs_embeds,
362
+ **kwargs,
363
+ )
364
+ return model_inputs
365
+
366
+ def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
367
+ return self.model.shift_tokens_right(labels)
368
+
369
+ def _reorder_cache(self, *args, **kwargs):
370
+ return self.model._reorder_cache(*args, **kwargs)
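Editor's note: a minimal end-to-end inference sketch, not part of the commit. It assumes the repository's config.json registers these classes for AutoModelForCausalLM / AutoProcessor via auto_map (only the processor mapping is visible in this diff); the repo id and image path below are hypothetical placeholders.

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

repo = "your-org/mono-checkpoint"   # hypothetical repo id
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True).eval()

image = Image.open("example.jpg")   # placeholder image path
inputs = processor(text="What does the image describe?", images=image)

# MonoProcessor prepends the image placeholder id (-200) to input_ids;
# MonoForConditionalGeneration.generate swaps it for the projected vision features.
output_ids = model.generate(
    input_ids=inputs["input_ids"][0].unsqueeze(0),   # the processor returns a list of 1-D tensors
    pixel_values=inputs["pixel_values"],
    max_new_tokens=64,
)
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])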
preprocessor_config.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_mono.MonoProcessor"
4
+ },
5
+ "crop_size": {
6
+ "height": 448,
7
+ "width": 448
8
+ },
9
+ "do_center_crop": true,
10
+ "do_convert_rgb": true,
11
+ "do_normalize": true,
12
+ "do_rescale": true,
13
+ "do_resize": true,
14
+ "image_mean": [
15
+ 0.48145466,
16
+ 0.4578275,
17
+ 0.40821073
18
+ ],
19
+ "image_processor_type": "CLIPImageProcessor",
20
+ "image_seq_length": 577,
21
+ "image_std": [
22
+ 0.26862954,
23
+ 0.26130258,
24
+ 0.27577711
25
+ ],
26
+ "processor_class": "MonoProcessor",
27
+ "resample": 3,
28
+ "rescale_factor": 0.00392156862745098,
29
+ "size": {
30
+ "shortest_edge": 448
31
+ }
32
+ }
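Editor's note: a short sketch (not part of the commit) of how this image-processor config is typically consumed, assuming the JSON above has been saved locally as preprocessor_config.json.

from PIL import Image
from transformers import CLIPImageProcessor

image_processor = CLIPImageProcessor.from_pretrained(".")   # directory containing the JSON above
pixel_values = image_processor(Image.new("RGB", (640, 480)), return_tensors="pt")["pixel_values"]
# Resized so the shortest edge is 448, then center-cropped to 448x448.
print(pixel_values.shape)   # torch.Size([1, 3, 448, 448])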
processing_mono.py ADDED
@@ -0,0 +1,259 @@
1
+ import re
2
+ import logging
3
+ from typing import List, Optional, Union
4
+ import numpy as np
5
+
6
+ import torch
7
+
8
+ from transformers.feature_extraction_utils import BatchFeature
9
+ from transformers.image_utils import ImageInput, is_valid_image
10
+ from transformers.processing_utils import ProcessorMixin
11
+ from transformers.tokenization_utils_base import (
12
+ PaddingStrategy,
13
+ PreTokenizedInput,
14
+ TextInput,
15
+ TruncationStrategy,
16
+ )
17
+ from transformers.utils import TensorType
18
+
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ # Copied from transformers.models.idefics2.processing_idefics2.is_url
24
+ def is_url(val) -> bool:
25
+ return isinstance(val, str) and val.startswith("http")
26
+
27
+
28
+ # Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
29
+ def is_image_or_image_url(elem):
30
+ return is_url(elem) or is_valid_image(elem)
31
+
32
+
33
+ def _is_str_or_image(elem):
34
+ return isinstance(elem, (str)) or is_image_or_image_url(elem)
35
+
36
+
37
+ class MonoProcessor(ProcessorMixin):
38
+
39
+ attributes = ["image_processor", "tokenizer"]
40
+ image_processor_class = "CLIPImageProcessor"
41
+ # tokenizer_class = ("BartTokenizer", "BartTokenizerFast")
42
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
43
+
44
+ def __init__(
45
+ self,
46
+ image_processor=None,
47
+ tokenizer=None,
48
+ ):
49
+ if image_processor is None:
50
+ raise ValueError("You need to specify an `image_processor`.")
51
+ if tokenizer is None:
52
+ raise ValueError("You need to specify a `tokenizer`.")
53
+
54
+ tokens_to_add = {
55
+ "additional_special_tokens": tokenizer.additional_special_tokens
56
+ + ["<od>", "</od>", "<ocr>", "</ocr>"]
57
+ + [f"<loc_{x}>" for x in range(1000)]
58
+ + [
59
+ "<cap>",
60
+ "</cap>",
61
+ "<ncap>",
62
+ "</ncap>",
63
+ "<dcap>",
64
+ "</dcap>",
65
+ "<grounding>",
66
+ "</grounding>",
67
+ "<seg>",
68
+ "</seg>",
69
+ "<sep>",
70
+ "<region_cap>",
71
+ "</region_cap>",
72
+ "<region_to_desciption>",
73
+ "</region_to_desciption>",
74
+ "<proposal>",
75
+ "</proposal>",
76
+ "<poly>",
77
+ "</poly>",
78
+ "<and>",
79
+ ]
80
+ }
81
+ tokenizer.add_special_tokens(tokens_to_add)
82
+
83
+ self.tasks_answer_post_processing_type = {
84
+ "<OCR>": "pure_text",
85
+ "<OCR_WITH_REGION>": "ocr",
86
+ "<CAPTION>": "pure_text",
87
+ "<DETAILED_CAPTION>": "pure_text",
88
+ "<MORE_DETAILED_CAPTION>": "pure_text",
89
+ "<OD>": "description_with_bboxes",
90
+ "<DENSE_REGION_CAPTION>": "description_with_bboxes",
91
+ "<CAPTION_TO_PHRASE_GROUNDING>": "phrase_grounding",
92
+ "<REFERRING_EXPRESSION_SEGMENTATION>": "polygons",
93
+ "<REGION_TO_SEGMENTATION>": "polygons",
94
+ "<OPEN_VOCABULARY_DETECTION>": "description_with_bboxes_or_polygons",
95
+ "<REGION_TO_CATEGORY>": "pure_text",
96
+ "<REGION_TO_DESCRIPTION>": "pure_text",
97
+ "<REGION_TO_OCR>": "pure_text",
98
+ "<REGION_PROPOSAL>": "bboxes",
99
+ }
100
+
101
+ self.task_prompts_without_inputs = {
102
+ "<OCR>": "What is the text in the image?",
103
+ "<OCR_WITH_REGION>": "What is the text in the image, with regions?",
104
+ "<CAPTION>": "What does the image describe?",
105
+ "<DETAILED_CAPTION>": "Describe in detail what is shown in the image.",
106
+ "<MORE_DETAILED_CAPTION>": "Describe with a paragraph what is shown in the image.",
107
+ "<OD>": "Locate the objects with category name in the image.",
108
+ "<DENSE_REGION_CAPTION>": "Locate the objects in the image, with their descriptions.",
109
+ "<REGION_PROPOSAL>": "Locate the region proposals in the image.",
110
+ }
111
+
112
+ self.task_prompts_with_input = {
113
+ "<CAPTION_TO_PHRASE_GROUNDING>": "Locate the phrases in the caption: {input}",
114
+ "<REFERRING_EXPRESSION_SEGMENTATION>": "Locate {input} in the image with mask",
115
+ "<REGION_TO_SEGMENTATION>": "What is the polygon mask of region {input}",
116
+ "<OPEN_VOCABULARY_DETECTION>": "Locate {input} in the image.",
117
+ "<REGION_TO_CATEGORY>": "What is the region {input}?",
118
+ "<REGION_TO_DESCRIPTION>": "What does the region {input} describe?",
119
+ "<REGION_TO_OCR>": "What text is in the region {input}?",
120
+ }
121
+
122
+ super().__init__(image_processor, tokenizer)
123
+
124
+ def construct_prompts(self, text):
125
+ # replace the task tokens with the task prompts if task token is in the text
126
+ if isinstance(text, str):
+ _text = text  # fall back to the raw text when no task token matches
127
+ for task_token, task_prompt in self.task_prompts_without_inputs.items():
128
+ if task_token in text:
129
+ _text = task_prompt
130
+ break
131
+ return _text
132
+ prompts = []
133
+ for _text in text:
134
+ # 1. fixed task prompts without additional inputs
135
+ for task_token, task_prompt in self.task_prompts_without_inputs.items():
136
+ if task_token in _text:
137
+ assert (
138
+ _text == task_token
139
+ ), f"Task token {task_token} should be the only token in the text."
140
+ _text = task_prompt
141
+ break
142
+ # 2. task prompts with additional inputs
143
+ for task_token, task_prompt in self.task_prompts_with_input.items():
144
+ if task_token in _text:
145
+ _text = task_prompt.format(input=_text.replace(task_token, ""))
146
+ break
147
+ prompts.append(_text)
148
+ return prompts
149
+
150
+ def __call__(
151
+ self,
152
+ text: Union[
153
+ TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
154
+ ] = None,
155
+ images: ImageInput = None,
156
+ tokenize_newline_separately: bool = True,
157
+ padding: Union[bool, str, PaddingStrategy] = False,
158
+ truncation: Union[bool, str, TruncationStrategy] = None,
159
+ max_length=None,
160
+ return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
161
+ do_resize: bool = None,
162
+ size=None,
163
+ do_normalize: bool = None,
164
+ image_mean: Optional[Union[float, List[float]]] = None,
165
+ image_std: Optional[Union[float, List[float]]] = None,
166
+ data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821
167
+ input_data_format: Optional[
168
+ Union[str, "ChannelDimension"] # noqa: F821
169
+ ] = None,
170
+ resample: "PILImageResampling" = None, # noqa: F821
171
+ do_convert_rgb: bool = None,
172
+ do_thumbnail: bool = None,
173
+ do_align_long_axis: bool = None,
174
+ do_rescale: bool = None,
175
+ ) -> BatchFeature:
176
+ return_token_type_ids = False
177
+
178
+ if text is None:
179
+ logger.warning("You are using MonoProcessor without a text prompt.")
180
+ text = ""
181
+
182
+ if isinstance(text, List) and isinstance(images, List):
183
+ if len(images) < len(text):
184
+ raise ValueError(
185
+ f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image."
186
+ )
187
+ if _is_str_or_image(text):
188
+ text = [text]
189
+ elif isinstance(text, list) and _is_str_or_image(text[0]):
190
+ pass
191
+
192
+ if images is not None:
193
+ pixel_values = self.image_processor(
194
+ images,
195
+ size=size,
196
+ do_resize=do_resize,
197
+ do_normalize=do_normalize,
198
+ return_tensors=return_tensors,
199
+ image_mean=image_mean,
200
+ image_std=image_std,
201
+ input_data_format=input_data_format,
202
+ data_format=data_format,
203
+ resample=resample,
204
+ do_convert_rgb=do_convert_rgb,
205
+ )["pixel_values"]
206
+
207
+ # text = self.construct_prompts(text)
208
+
209
+ inputs = self.tokenizer(
210
+ text,
211
+ return_tensors=return_tensors,
212
+ padding=padding,
213
+ max_length=max_length,
214
+ truncation=truncation,
215
+ return_token_type_ids=return_token_type_ids,
216
+ )
217
+
218
+ if images is not None:
219
+ # print(inputs)
220
+ # add IMAGE_TOKEN
221
+ inputs_with_image = [
222
+ torch.cat((torch.tensor([-200]), b), dim=0) for b in inputs["input_ids"]
223
+ ]
224
+ # inputs["input_ids"] = torch.stack(inputs_with_image)
225
+ inputs["input_ids"] = inputs_with_image
226
+
227
+ return_data = {**inputs, "pixel_values": pixel_values}
228
+ else:
229
+ return_data = {**inputs, "pixel_values": None}
230
+
231
+ if return_token_type_ids:
232
+ labels = inputs["input_ids"].masked_fill(
233
+ inputs["token_type_ids"] == 0, -100
234
+ )
235
+ return_data.update({"labels": labels})
236
+ return BatchFeature(data=return_data)
237
+
238
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
239
+ def batch_decode(self, *args, **kwargs):
240
+ """
241
+ This method forwards all its arguments to the underlying Qwen2 tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
242
+ refer to the docstring of this method for more information.
243
+ """
244
+ return self.tokenizer.batch_decode(*args, **kwargs)
245
+
246
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
247
+ def decode(self, *args, **kwargs):
248
+ """
249
+ This method forwards all its arguments to the underlying Qwen2 tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
250
+ the docstring of this method for more information.
251
+ """
252
+ return self.tokenizer.decode(*args, **kwargs)
253
+
254
+ @property
255
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Florence2
256
+ def model_input_names(self):
257
+ tokenizer_input_names = self.tokenizer.model_input_names
258
+ image_processor_input_names = self.image_processor.model_input_names
259
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
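Note: the processor above expands Florence-2-style task tokens into natural-language prompts and, when images are passed, prepends a placeholder image token id (-200) to each input_ids row. A minimal usage sketch, assuming a `processor` instance of MonoProcessor has already been loaded and that "example.jpg" is a placeholder image path (neither is part of this upload):

    from PIL import Image

    image = Image.open("example.jpg")  # placeholder path

    # Fixed task tokens expand to their canned prompts ...
    print(processor.construct_prompts(["<OD>"]))
    # -> ['Locate the objects with category name in the image.']
    # ... while tokens that take an input are formatted with the trailing text.
    print(processor.construct_prompts(["<OPEN_VOCABULARY_DETECTION>a red car"]))
    # -> ['Locate a red car in the image.']

    # __call__ no longer applies construct_prompts itself (the call is commented out),
    # so the caller expands the task token before tokenization.
    prompt = processor.construct_prompts(["<CAPTION>"])[0]
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    # inputs["pixel_values"] comes from the wrapped image processor; each entry of
    # inputs["input_ids"] has the -200 image placeholder prepended.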
processor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_mono.MonoProcessor"
4
+ },
5
+ "processor_class": "MonoProcessor"
6
+ }
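Note: the "auto_map" entry above is what lets AutoProcessor resolve the custom MonoProcessor class shipped in processing_mono.py. A minimal sketch, assuming the uploaded files live under a placeholder repo id and that remote code is explicitly trusted:

    from transformers import AutoProcessor

    # "your-namespace/your-model" is a placeholder, not the actual repo id.
    processor = AutoProcessor.from_pretrained(
        "your-namespace/your-model",
        trust_remote_code=True,  # required so processing_mono.py from the repo is imported
    )
    print(type(processor).__name__)  # expected: MonoProcessor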
special_tokens_map.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
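Note: the tokenizer_config.json above ships a Qwen2 tokenizer with a ChatML-style chat_template, <|im_end|> as the eos_token and <|endoftext|> as the pad_token. A minimal sketch of rendering that template on its own, assuming the same placeholder repo id as above:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("your-namespace/your-model")  # placeholder id
    messages = [{"role": "user", "content": "Describe the image."}]
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Yields the Qwen-style "<|im_start|>system ... <|im_start|>assistant" prompt;
    # add_generation_prompt=True appends the assistant header for generation.
    print(text)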
vocab.json ADDED
The diff for this file is too large to render. See raw diff