Isaak Carter Augustus committed on
Commit bf0fed4
1 Parent(s): 6eb8ee4

Delete josie_architecture.txt

Files changed (1)
  1. josie_architecture.txt +0 -1002
josie_architecture.txt DELETED
@@ -1,1002 +0,0 @@
- JOSIE(
-   (encoder): Encoder(
-     (modality_preprocessors): ModuleDict(
-       (vision): RGBDTPreprocessor(
-         (cls_token): tensor((1, 1, 768), requires_grad=False)
-         (rgbt_stem): PatchEmbedGeneric(
-           (proj): Sequential(
-             (0): PadIm2Video()
-             (1): Conv3d(3, 768, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
-           )
-         )
-         (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
-           (pos_embed): tensor((1, 7681, 768), requires_grad=False)
-         )
-       )
-       (audio): AudioPreprocessor(
-         (cls_token): tensor((1, 1, 768), requires_grad=False)
-         (rgbt_stem): PatchEmbedGeneric(
-           (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10), bias=False)
-           (norm_layer): RMSNorm()
-         )
-         (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
-           (pos_embed): tensor((1, 229, 768), requires_grad=False)
-         )
-       )
-       (depth): RGBDTPreprocessor(
-         (cls_token): tensor((1, 1, 384), requires_grad=False)
-         (depth_stem): PatchEmbedGeneric(
-           (proj): Conv2d(1, 384, kernel_size=(16, 16), stride=(16, 16), bias=False)
-           (norm_layer): RMSNorm()
-         )
-         (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
-           (pos_embed): tensor((1, 197, 384), requires_grad=False)
-         )
-       )
-       (thermal): ThermalPreprocessor(
-         (cls_token): tensor((1, 1, 768), requires_grad=False)
-         (rgbt_stem): PatchEmbedGeneric(
-           (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
-           (norm_layer): RMSNorm()
-         )
-         (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
-           (pos_embed): tensor((1, 197, 768), requires_grad=False)
-         )
-       )
-     )
-     (modality_transformers): ModuleDict(
-       (vision): EncoderTransformer(
-         (pre_transformer_layer): Sequential(
-           (0): RMSNorm()
-           (1): EinOpsRearrange()
-         )
-         (post_transformer_layer): EinOpsRearrange()
-         (blocks): ModuleList(
-           (0-11): 12 x EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-             )
-             (drop_path): Identity()
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=768, out_features=512, bias=False)
-               (w2): Linear(in_features=512, out_features=768, bias=False)
-               (w3): Linear(in_features=768, out_features=512, bias=False)
-             )
-           )
-         )
-       )
-       (audio): EncoderTransformer(
-         (pre_transformer_layer): Sequential(
-           (0): RMSNorm()
-           (1): EinOpsRearrange()
-         )
-         (post_transformer_layer): EinOpsRearrange()
-         (blocks): ModuleList(
-           (0): EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-             )
-             (drop_path): Identity()
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=768, out_features=512, bias=False)
-               (w2): Linear(in_features=512, out_features=768, bias=False)
-               (w3): Linear(in_features=768, out_features=512, bias=False)
-             )
-           )
-           (1): EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-             )
-             (drop_path): DropPath(drop_prob=0.009)
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=768, out_features=512, bias=False)
-               (w2): Linear(in_features=512, out_features=768, bias=False)
-               (w3): Linear(in_features=768, out_features=512, bias=False)
-             )
-           )
-           (2): EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-             )
-             (drop_path): DropPath(drop_prob=0.018)
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=768, out_features=512, bias=False)
-               (w2): Linear(in_features=512, out_features=768, bias=False)
-               (w3): Linear(in_features=768, out_features=512, bias=False)
-             )
-           )
-           (3): EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-             )
-             (drop_path): DropPath(drop_prob=0.027)
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=768, out_features=512, bias=False)
-               (w2): Linear(in_features=512, out_features=768, bias=False)
-               (w3): Linear(in_features=768, out_features=512, bias=False)
-             )
-           )
-           (4): EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-             )
-             (drop_path): DropPath(drop_prob=0.036)
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=768, out_features=512, bias=False)
-               (w2): Linear(in_features=512, out_features=768, bias=False)
-               (w3): Linear(in_features=768, out_features=512, bias=False)
-             )
-           )
-           (5): EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-             )
-             (drop_path): DropPath(drop_prob=0.045)
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=768, out_features=512, bias=False)
-               (w2): Linear(in_features=512, out_features=768, bias=False)
-               (w3): Linear(in_features=768, out_features=512, bias=False)
-             )
-           )
-           (6): EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-             )
-             (drop_path): DropPath(drop_prob=0.055)
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=768, out_features=512, bias=False)
-               (w2): Linear(in_features=512, out_features=768, bias=False)
-               (w3): Linear(in_features=768, out_features=512, bias=False)
-             )
-           )
-           (7): EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-             )
-             (drop_path): DropPath(drop_prob=0.064)
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=768, out_features=512, bias=False)
-               (w2): Linear(in_features=512, out_features=768, bias=False)
-               (w3): Linear(in_features=768, out_features=512, bias=False)
-             )
-           )
-           (8): EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-             )
-             (drop_path): DropPath(drop_prob=0.073)
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=768, out_features=512, bias=False)
-               (w2): Linear(in_features=512, out_features=768, bias=False)
-               (w3): Linear(in_features=768, out_features=512, bias=False)
-             )
-           )
-           (9): EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-             )
-             (drop_path): DropPath(drop_prob=0.082)
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=768, out_features=512, bias=False)
-               (w2): Linear(in_features=512, out_features=768, bias=False)
-               (w3): Linear(in_features=768, out_features=512, bias=False)
-             )
-           )
-           (10): EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-             )
-             (drop_path): DropPath(drop_prob=0.091)
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=768, out_features=512, bias=False)
-               (w2): Linear(in_features=512, out_features=768, bias=False)
-               (w3): Linear(in_features=768, out_features=512, bias=False)
-             )
-           )
-           (11): EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-             )
-             (drop_path): DropPath(drop_prob=0.100)
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=768, out_features=512, bias=False)
-               (w2): Linear(in_features=512, out_features=768, bias=False)
-               (w3): Linear(in_features=768, out_features=512, bias=False)
-             )
-           )
-         )
-       )
-       (depth): EncoderTransformer(
-         (pre_transformer_layer): Sequential(
-           (0): RMSNorm()
-           (1): EinOpsRearrange()
-         )
-         (post_transformer_layer): EinOpsRearrange()
-         (blocks): ModuleList(
-           (0-5): 6 x EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True)
-             )
-             (drop_path): Identity()
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=384, out_features=256, bias=False)
-               (w2): Linear(in_features=256, out_features=384, bias=False)
-               (w3): Linear(in_features=384, out_features=256, bias=False)
-             )
-           )
-         )
-       )
-       (thermal): EncoderTransformer(
-         (pre_transformer_layer): Sequential(
-           (0): RMSNorm()
-           (1): EinOpsRearrange()
-         )
-         (post_transformer_layer): EinOpsRearrange()
-         (blocks): ModuleList(
-           (0-5): 6 x EncoderTransformerBlock(
-             (attn): MultiheadAttention(
-               (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
-             )
-             (drop_path): Identity()
-             (norm1): RMSNorm()
-             (norm2): RMSNorm()
-             (mlp): MLP(
-               (w1): Linear(in_features=768, out_features=512, bias=False)
-               (w2): Linear(in_features=512, out_features=768, bias=False)
-               (w3): Linear(in_features=768, out_features=512, bias=False)
-             )
-           )
-         )
-       )
-     )
-     (modality_heads): ModuleDict(
-       (vision): Sequential(
-         (0): RMSNorm()
-         (1): SelectElement()
-         (2): Linear(in_features=768, out_features=1024, bias=False)
-       )
-       (audio): Sequential(
-         (0): RMSNorm()
-         (1): SelectElement()
-         (2): Linear(in_features=768, out_features=1024, bias=False)
-       )
-       (depth): Sequential(
-         (0): RMSNorm()
-         (1): SelectElement()
-         (2): Linear(in_features=384, out_features=1024, bias=False)
-       )
-       (thermal): Sequential(
-         (0): RMSNorm()
-         (1): SelectElement()
-         (2): Linear(in_features=768, out_features=1024, bias=False)
-       )
-     )
-   )
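
A note on the encoder tree above: each EncoderTransformerBlock carries a three-matrix MLP (w1, w2, w3) whose shapes match the gated SwiGLU layout used in LLaMA-family models, and each modality head collapses the token sequence to a single vector (SelectElement, presumably the prepended CLS token) before projecting it into the shared 1024-d embedding space. A minimal sketch of both, assuming those standard formulations; the dump shows only module shapes, so the forward logic here is inferred:

import torch
import torch.nn as nn
import torch.nn.functional as F

# RMS normalization, as printed throughout the dump.
class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.weight * x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

# Gated (SwiGLU-style) MLP matching the printed w1/w2/w3 shapes.
class MLP(nn.Module):
    def __init__(self, dim: int = 768, hidden: int = 512):
        super().__init__()
        self.w1 = nn.Linear(dim, hidden, bias=False)   # gate projection
        self.w2 = nn.Linear(hidden, dim, bias=False)   # down projection
        self.w3 = nn.Linear(dim, hidden, bias=False)   # up projection

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.w2(F.silu(self.w1(x)) * self.w3(x))

# Modality head: RMSNorm -> select one token -> project to 1024-d.
class ModalityHead(nn.Module):
    def __init__(self, dim: int = 768, embed_dim: int = 1024):
        super().__init__()
        self.norm = RMSNorm(dim)
        self.proj = nn.Linear(dim, embed_dim, bias=False)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        # tokens: (batch, seq, dim); index 0 assumed to be the CLS token
        return self.proj(self.norm(tokens)[:, 0])
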
-   (reasoner): Qwen2ForCausalLM(
-     (model): Qwen2Model(
-       (embed_tokens): Embedding(151936, 896)
-       (layers): ModuleList(
-         (0-23): 24 x Qwen2DecoderLayer(
-           (self_attn): Qwen2Attention(
-             (q_proj): Linear(in_features=896, out_features=896, bias=True)
-             (k_proj): Linear(in_features=896, out_features=128, bias=True)
-             (v_proj): Linear(in_features=896, out_features=128, bias=True)
-             (o_proj): Linear(in_features=896, out_features=896, bias=False)
-             (rotary_emb): Qwen2RotaryEmbedding()
-           )
-           (mlp): Qwen2MLP(
-             (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
-             (up_proj): Linear(in_features=896, out_features=4864, bias=False)
-             (down_proj): Linear(in_features=4864, out_features=896, bias=False)
-             (act_fn): SiLU()
-           )
-           (input_layernorm): Qwen2RMSNorm()
-           (post_attention_layernorm): Qwen2RMSNorm()
-         )
-       )
-       (norm): Qwen2RMSNorm()
-     )
-     (lm_head): Linear(in_features=896, out_features=151936, bias=False)
-   )
-   (input_projetor): Linear(in_features=1024, out_features=896, bias=True)
- )
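
Reading the tree end to end: every modality head emits a 1024-d embedding, input_projetor (the spelling is the model's own attribute name) maps it into the reasoner's 896-d hidden space, and the reasoner itself has the shape of Qwen2-0.5B (hidden size 896, 24 layers, vocabulary 151936, grouped-query attention with 128-d k/v projections, i.e. 2 key/value heads against 14 query heads of width 64). How the projected embeddings are merged with the text stream is not visible in the dump; below is a minimal wiring sketch assuming the common prepend-to-token-embeddings pattern, with a hypothetical encoder call signature:

import torch
import torch.nn as nn

# Hedged top-level wiring: encoder -> input_projetor -> Qwen2 reasoner.
class JOSIEWiring(nn.Module):
    def __init__(self, encoder: nn.Module, reasoner: nn.Module):
        super().__init__()
        self.encoder = encoder              # the multimodal encoder above
        self.reasoner = reasoner            # Qwen2ForCausalLM, hidden=896
        # attribute name kept exactly as printed in the dump
        self.input_projetor = nn.Linear(1024, 896, bias=True)

    def forward(self, modality_inputs: dict, input_ids: torch.Tensor):
        # Assumption: the encoder returns one 1024-d vector per modality;
        # its real call signature is not shown in the dump.
        embeds = [self.encoder(name, x) for name, x in modality_inputs.items()]
        mm = self.input_projetor(torch.stack(embeds, dim=1))   # (B, M, 896)
        txt = self.reasoner.model.embed_tokens(input_ids)      # (B, T, 896)
        # Assumed merge: prepend multimodal tokens to the text embeddings.
        return self.reasoner(inputs_embeds=torch.cat([mm, txt], dim=1))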