Evaluation Results
| Model | AIME2024 | MATH500 | GPQA-Diamond | AIME2025 Part I |
|---|---|---|---|---|
| Bespoke-Stratos-32B | 63.3 | 93.0 | 58.1 | - |
| Sky-T1-32B | 43.3 | 82.4 | 56.8 | - |
| DeepSeek-R1-Distill-Qwen-32B | 66.7 | 89.8 | 61.1 | 53.3 |
| OpenThinker-32B | 66.0 | 90.6 | 61.6 | 53.3 |
| Control-R-32B (Ours) | 70.0 | 93.2 | 61.1 | 55.0 |
| o1-preview | 40.0 | 81.4 | 75.2 | 78.3 |
| DeepSeek-R1 | 79.8 | 97.3 | 71.5 | 65.0 |
Usage
Prompt
# Build the prompt in three parts: the user question, the control-field
# block, and the step-by-step / boxed-answer instruction.
query = '...'  # placeholder: replace with the actual question text
# Each control field is set to 9 in this example.
# NOTE(review): the closing tag is written `<control/>` rather than
# `</control>`; kept exactly as given -- confirm it matches the expected format.
control_fields = "\n<control> search_depth: 9; search_breadth: 9; error_detection: 9; error_correction: 9; strategy_switching: 9; correctness: 9; efficiency: 9; completeness: 9; coherence: 9; knowledge_accuracy: 9; clarity_of_steps: 9 <control/>"
# No trailing comma after the string: a trailing comma would turn the
# right-hand side into a one-element tuple, and `str += tuple` raises TypeError.
query += control_fields + "\nPlease reason step by step, and put your final answer within \\boxed{}."