Update README.md
README.md CHANGED
@@ -53,49 +53,75 @@ InternVL 2.5 is a multimodal large language model series, featuring models of va

### Image Benchmarks

-| Benchmark                  | Ovis1.6-Gemma2-9B | MiniCPM-V2.6 | Molmo-7B-D | Qwen2-VL-7B | InternVL2-8B | InternVL2.5-8B |
-|----------------------------|-------------------|--------------|------------|-------------|--------------|----------------|
-| MMMU (val)                 | 55.0              | 49.8         | -          | 54.1        | 52.6         | 56.0           |
-| MMMU (test)                | --                | --           | -          | -           | 44.3         | 48.9           |
-| MMMU-PRO (overall)         | --                | 27.2         | -          | 30.5        | 29.0         | 34.3           |
-| MathVista (mini)           | 67.2              | 60.6         | -          | 58.2        | 58.3         | 64.4           |
-| MathVision (mini)          | -                 | 16.1         | -          | 22.0        | 20.4         | 22.0           |
-| MathVision (full)          | 18.8              | 17.5         | -          | 16.3        | 18.4         | 19.7           |
-| MathVerse (mini)           | --                | 25.7         | -          | 31.9        | 37.0         | 39.5           |
-| Olympiad Bench             | --                | --           | -          | -           | 1.9          | 4.9            |
-| AI2D (w / wo M)            | 84.4 / -          | 82.1 / -     | - / 93.2   | 83.0 / 92.1 | 83.8 / 91.7  | 84.5 / 92.8    |
-| ChartQA (test avg.)        | --                | 82.4         | 84.1       | 83.0        | 83.3         | 84.8           |
-| TextVQA (val)              | --                | 80.1         | 81.7       | 84.3        | 77.4         | 79.1           |
-| DocVQA (test)              | --                | 90.8         | 92.2       | 94.5        | 91.6         | 93.0           |
-| InfoVQA (test)             | --                | --           | 72.6       | 76.5        | 74.8         | 77.6           |
-| OCR-Bench                  | 830               | 852          | 694        | 866         | 794          | 822            |
-| SEED-2 Plus                | --                | 65.7         | --         | 69.0        | 67.5         | 69.7           |
-| CharXiv (RQ / DQ)          | --                | 31.0 / 57.1  | --         | --          | 31.2 / 56.1  | 32.9 / 68.6    |
-| VCR-EN-Easy (EM / Jaccard) | --                | 73.9 / 85.7  | --         | 89.7 / 93.8 | 37.9 / 61.5  | 92.6 / 97.4    |
-| BLINK (val)                | -                 | 53.0         | -          | 53.2        | 50.9         | 54.8           |
-| Mantis Eval                | -                 | 69.0         | -          | -           | 65.4         | 67.7           |
-| MMIU                       | -                 | -            | -          | -           | 42.0         | 46.7           |
-| Muir Bench                 | -                 | -            | -          | -           | 48.7         | 51.1           |
-| MMT (val)                  | -                 | 60.8         | -          | 64.0        | 60.0         | 62.3           |
-| MIRB (avg.)                | -                 | -            | -          | -           | 50.0         | 52.5           |
-| RealWorld QA               | -                 | 65.0         | -          | 70.1        | 64.4         | 70.1           |
-| MME-RW (EN)                | -                 | -            | -          | 56.5        | 53.5         | 59.1           |
-| WildVision (win rate)      | -                 | -            | -          | -           | 54.4         | 62.0           |
-| R-Bench                    | -                 | -            | -          | 64.0        | 67.9         | 70.1           |
-| MME (sum)                  | -                 | 2348.4       | -          | 2326.8      | 2210.3       | 2344.1         |
-| MMB (EN / CN)              | -                 | 81.5 / 79.3  | -          | 83.0 / 80.5 | 81.7 / 81.2  | 84.6 / 82.6    |
-| MMBv1.1 (EN)               | -                 | 78.0         | -          | 80.7        | 79.5         | 83.2           |
-| MMVet (turbo)              | -                 | 60.0         | -          | 62.0        | 54.2         | 62.8           |
-| MMVetv2 (0613)             | -                 | -            | -          | --          | 52.3         | 58.1           |
-| MMStar                     | -                 | 57.5         | -          | 60.7        | 62.0         | 62.8           |
-| HallBench (avg.)           | -                 | 48.1         | -          | 50.6        | 45.2         | 50.1           |
-| MMHal (score)              | -                 | 3.60         | -          | 3.40        | 3.33         | 3.65           |
-| CRPE (relation)            | -                 | 75.2         | -          | 74.4        | 75.8         | 78.4           |
-| POPE (avg.)                | -                 | 87.3         | -          | 88.1        | 86.9         | 90.6           |
+| Benchmark                  | Ovis1.6-Gemma2-9B | MiniCPM-V2.6 | Molmo-7B-D | Qwen2-VL-7B | InternVL2.5-8B |
+|----------------------------|-------------------|--------------|------------|-------------|----------------|
+| MMMU (val)                 | 55.0              | 49.8         | -          | 54.1        | 56.0           |
+| MMMU (test)                | --                | --           | -          | -           | 48.9           |
+| MMMU-PRO (overall)         | --                | 27.2         | -          | 30.5        | 34.3           |
+| MathVista (mini)           | 67.2              | 60.6         | -          | 58.2        | 64.4           |
+| MathVision (mini)          | -                 | 16.1         | -          | 22.0        | 22.0           |
+| MathVision (full)          | 18.8              | 17.5         | -          | 16.3        | 19.7           |
+| MathVerse (mini)           | --                | 25.7         | -          | 31.9        | 39.5           |
+| Olympiad Bench             | --                | --           | -          | -           | 4.9            |
+| AI2D (w / wo M)            | 84.4 / -          | 82.1 / -     | - / 93.2   | 83.0 / 92.1 | 84.5 / 92.8    |
+| ChartQA (test avg.)        | --                | 82.4         | 84.1       | 83.0        | 84.8           |
+| TextVQA (val)              | --                | 80.1         | 81.7       | 84.3        | 79.1           |
+| DocVQA (test)              | --                | 90.8         | 92.2       | 94.5        | 93.0           |
+| InfoVQA (test)             | --                | --           | 72.6       | 76.5        | 77.6           |
+| OCR-Bench                  | 830               | 852          | 694        | 866         | 822            |
+| SEED-2 Plus                | --                | 65.7         | --         | 69.0        | 69.7           |
+| CharXiv (RQ / DQ)          | --                | 31.0 / 57.1  | --         | --          | 32.9 / 68.6    |
+| VCR-EN-Easy (EM / Jaccard) | --                | 73.9 / 85.7  | --         | 89.7 / 93.8 | 92.6 / 97.4    |
+| BLINK (val)                | -                 | 53.0         | -          | 53.2        | 54.8           |
+| Mantis Eval                | -                 | 69.0         | -          | -           | 67.7           |
+| MMIU                       | -                 | -            | -          | -           | 46.7           |
+| Muir Bench                 | -                 | -            | -          | -           | 51.1           |
+| MMT (val)                  | -                 | 60.8         | -          | 64.0        | 62.3           |
+| MIRB (avg.)                | -                 | -            | -          | -           | 52.5           |
+| RealWorld QA               | -                 | 65.0         | -          | 70.1        | 70.1           |
+| MME-RW (EN)                | -                 | -            | -          | 56.5        | 59.1           |
+| WildVision (win rate)      | -                 | -            | -          | -           | 62.0           |
+| R-Bench                    | -                 | -            | -          | 64.0        | 70.1           |
+| MME (sum)                  | -                 | 2348.4       | -          | 2326.8      | 2344.1         |
+| MMB (EN / CN)              | -                 | 81.5 / 79.3  | -          | 83.0 / 80.5 | 84.6 / 82.6    |
+| MMBv1.1 (EN)               | -                 | 78.0         | -          | 80.7        | 83.2           |
+| MMVet (turbo)              | -                 | 60.0         | -          | 62.0        | 62.8           |
+| MMVetv2 (0613)             | -                 | -            | -          | --          | 58.1           |
+| MMStar                     | -                 | 57.5         | -          | 60.7        | 62.8           |
+| HallBench (avg.)           | -                 | 48.1         | -          | 50.6        | 50.1           |
+| MMHal (score)              | -                 | 3.60         | -          | 3.40        | 3.65           |
+| CRPE (relation)            | -                 | 75.2         | -          | 74.4        | 78.4           |
+| POPE (avg.)                | -                 | 87.3         | -          | 88.1        | 90.6           |
+

### Video Benchmarks

+| Model Name           | Video-MME (wo / w sub) | MVBench | MMBench-Video (val) | MLVU (M-Avg) | LongVideoBench (val total) | CG-Bench v1.1 (long / clue acc.) |
+|----------------------|------------------------|---------|---------------------|--------------|----------------------------|----------------------------------|
+| **InternVL2.5-1B**   | 50.3 / 52.3            | 64.3    | 1.36                | 57.3         | 47.9                       | -                                |
+| Qwen2-VL-2B          | 55.6 / 60.4            | 63.2    | -                   | -            | -                          | -                                |
+| **InternVL2.5-2B**   | 51.9 / 54.1            | 68.8    | 1.44                | 61.4         | 52.0                       | -                                |
+| **InternVL2.5-4B**   | 62.3 / 63.6            | 71.6    | 1.73                | 68.3         | 55.2                       | -                                |
+| VideoChat2-HD        | 45.3 / 55.7            | 62.3    | 1.22                | 47.9         | -                          | -                                |
+| MiniCPM-V-2.6        | 60.9 / 63.6            | -       | 1.70                | -            | 54.9                       | -                                |
+| LLaVA-OneVision-7B   | 58.2 / -               | 56.7    | -                   | -            | -                          | -                                |
+| Qwen2-VL-7B          | 63.3 / 69.0            | 67.0    | 1.44                | -            | 55.6                       | -                                |
+| **InternVL2.5-8B**   | 64.2 / 66.9            | 72.0    | 1.68                | 68.9         | 60.0                       | -                                |
+| **InternVL2.5-26B**  | 66.9 / 69.2            | 75.2    | 1.86                | 72.3         | 59.9                       | -                                |
+| Oryx-1.5-32B         | 67.3 / 74.9            | 70.1    | 1.52                | 72.3         | -                          | -                                |
+| VILA-1.5-40B         | 60.1 / 61.1            | -       | 1.61                | 56.7         | -                          | -                                |
+| **InternVL2.5-38B**  | 70.7 / 73.1            | 74.4    | 1.82                | 75.3         | 63.3                       | -                                |
+| GPT-4V/4T            | 59.9 / 63.3            | 43.7    | 1.53                | 49.2         | 59.1                       | -                                |
+| GPT-4o-20240513      | 71.9 / 77.2            | -       | 1.63                | 64.6         | 66.7                       | -                                |
+| GPT-4o-20240806      | -                      | -       | 1.87                | -            | -                          | -                                |
+| Gemini-1.5-Pro       | 75.0 / 81.3            | -       | 1.30                | -            | 64.0                       | -                                |
+| VideoLLaMA2-72B      | 61.4 / 63.1            | 62.0    | -                   | -            | -                          | -                                |
+| LLaVA-OneVision-72B  | 66.2 / 69.5            | 59.4    | -                   | 66.4         | 61.3                       | -                                |
+| Qwen2-VL-72B         | 71.2 / 77.8            | 73.6    | 1.70                | -            | -                          | 41.3 / 56.2                      |
+| InternVL2-Llama3-76B | 64.7 / 67.8            | 69.6    | 1.71                | 69.9         | 61.1                       | -                                |
+| **InternVL2.5-78B**  | 72.1 / 74.0            | 76.4    | 1.97                | 75.7         | 63.6                       | 42.2 / 58.5                      |
+

### Multimodal Multilingual Understanding

<table style="width:100%; font-size: small; border-collapse: collapse; text-align: center;">