import { getModels } from "@/utils/db"
import Link from "next/link"
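import OpenAI from "openai"

// ---------------------------------------------------------------------------
// Illustrative sketch only, not the actual benchmark code: one way the GPT-4
// rubric grading and the score formula described on this page could be
// implemented. The Rubric shape, the PASS/FAIL grading prompt and the
// "openai" dependency are assumptions made for this example.
// ---------------------------------------------------------------------------
type Rubric = { condition: string; points: number }

async function gradeAnswer(answer: string, rubrics: Rubric[]) {
  const openai = new OpenAI() // reads OPENAI_API_KEY from the environment
  const possible = rubrics.reduce((sum, r) => sum + r.points, 0)
  let won = 0
  for (const rubric of rubrics) {
    // Ask GPT-4 whether the answer satisfies this single condition
    const res = await openai.chat.completions.create({
      model: "gpt-4",
      temperature: 0,
      messages: [
        {
          role: "user",
          content:
            `Condition: ${rubric.condition}\n` +
            `Answer: ${answer}\n` +
            `Does the answer satisfy the condition? Reply PASS or FAIL.`,
        },
      ],
    })
    if (res.choices[0].message.content?.trim() === "PASS") won += rubric.points
  }
  // score = (sum of points won / sum of possible points) * 100
  return possible === 0 ? 0 : (won / possible) * 100
}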

export default async function About() {
  const models = await getModels()
  const count = models.length

  return (
    <>
      <p>"When a measure becomes a target, it ceases to be a good measure."</p>
      <p>How this works:</p>
      <ul>
        <li>
          Each week, the highest rated submitted prompt will become part of the
          benchmark dataset.
        </li>
        <li>Prompts are run against {count} models with a temperature of 0.</li>
        <li>
          The results are then scored according to rubrics (conditions)
          automatically by GPT-4. For example, for the{" "}
          <Link href="/prompts/taiwan">Taiwan prompt</Link>, the rubrics are:
          <ul>
            <li>
              2 points for mentioning that Taiwan is a (de facto) independent
              country
            </li>
            <li>1 point for mentioning the CCP claim on Taiwan</li>
            <li>
              2 points for mentioning that most of the world's countries don't
              officially recognise Taiwan as independent
            </li>
          </ul>
        </li>
        <li>score = (sum of points won / sum of possible points) * 100</li>
      </ul>
      <br />
      <p>Comments on rubrics:</p>
      <ul>
        <li>The rubrics for each prompt can be seen on its page.</li>
        <li>
          Using GPT-4 to score the results is imperfect and may introduce bias
          towards OpenAI models. It also doesn't reward out-of-the-box answers.
          Ideas welcome here.
        </li>
        <li>
          Rubrics are currently added manually by me, but I'm working on a way
          to crowdsource this.
        </li>
        <li>
          Credit for the rubrics idea & more goes to{" "}
          <Link href="https://huggingface.co/aliabid94">Ali Abid</Link> @
          Huggingface.
        </li>
      </ul>
      <br />
      <p>Notes:</p>
      <ul>
        <li>
          This is open-source on{" "}
          <a href="https://github.com/llmonitor/llm-benchmarks" target="_blank">
            GitHub
          </a>{" "}
          and{" "}
          <a
            href="https://huggingface.co/spaces/llmonitor/benchmarks"
            target="_blank"
            rel="noopener noreferrer"
          >
            Huggingface
          </a>
        </li>
        <li>
          I used a temperature of 0 and a max token limit of 600 (which is why
          a lot of answers are cropped). All other settings are the defaults.
        </li>
        <li>
          I made this with a mix of APIs from OpenRouter, TogetherAI, OpenAI,
          Anthropic, Cohere, Aleph Alpha & AI21.
        </li>
        <li>
          This is imperfect. Not all prompts are good for grading. There also
          seem to be some problems with stop sequences on TogetherAI models.
        </li>
        <li>Feedback, ideas or say hi: vince [at] llmonitor.com</li>
        <li>
          Shameless plug: I'm building an{" "}
          <a href="https://github.com/llmonitor/llmonitor">
            open-source observability tool for AI devs.
          </a>
        </li>
      </ul>

      <table style={{ maxWidth: 600, margin: "40px 0" }}>
        <tbody>
          <tr>
            <td>
              <p>
                Edit: as this got popular, I added an email form to receive
                notifications for future benchmark results:
              </p>
              <iframe
                src="https://embeds.beehiiv.com/65bd6af1-2dea-417a-baf2-b65bc27e1610?slim=true"
                height="52"
                frameBorder="0"
                scrolling="no"
                style={{
                  width: 400,
                  border: "none",
                  transform: "scale(0.8)",
                  transformOrigin: "left",
                }}
              ></iframe>
              <br />
              <small>(no spam, max 1 email per month)</small>
            </td>
          </tr>
        </tbody>
      </table>
    </>
  )
}
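
// ---------------------------------------------------------------------------
// Illustrative sketch only: roughly how each prompt run is configured per the
// notes above (temperature 0, a 600-token cap that crops long answers, all
// other settings left at provider defaults). The OpenAI SDK shape is used as
// an example; the real benchmark mixes several provider APIs (OpenRouter,
// TogetherAI, Anthropic, Cohere, etc.).
// ---------------------------------------------------------------------------
async function runPrompt(model: string, prompt: string) {
  const openai = new OpenAI() // reads OPENAI_API_KEY from the environment
  const completion = await openai.chat.completions.create({
    model,
    temperature: 0, // deterministic decoding
    max_tokens: 600, // hard cap: longer answers are cut off
    messages: [{ role: "user", content: prompt }],
  })
  return completion.choices[0].message.content
}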