vincelwt committed
Commit 34dd66d
Parent(s): 12e3264

add other url

Files changed (1)
app/leaderboard/page.js +63 -0
app/leaderboard/page.js ADDED
@@ -0,0 +1,63 @@
+ import db, { getModels } from "@/utils/db"
+ import Link from "next/link"
+
+ export default async function Leaderboard() {
+   const [potentialPoints] = await db`SELECT SUM(points) as total FROM rubrics`
+
+   const models = await getModels()
+   return (
+     <>
+       <p>
+         Traditional LLM benchmarks have drawbacks: they quickly become part of
+         training datasets and are hard to relate to in terms of real-world
+         use cases.
+       </p>
+       <p>
+         I made this as an experiment to address these issues. Here, the dataset
+         is dynamic (it changes every week) and composed of crowdsourced
+         real-world prompts.
+       </p>
+       <p>
+         We then use GPT-4 to grade each model's response against a set of
+         rubrics (more details on the about page). The prompt dataset is easily
+         explorable.
+       </p>
+       <p>
+         Everything is then stored in a Postgres database, and this page shows
+         the raw results.
+       </p>
+
+       <br />
+       <table style={{ maxWidth: 600 }}>
+         <thead>
+           <tr>
+             <th width={70}>Rank</th>
+             <th width={250}>Model</th>
+             <th>Score</th>
+             <th>Results</th>
+           </tr>
+         </thead>
+         <tbody>
+           {models
+             .filter((s) => s.total_score)
+             .map((model, i) => (
+               <tr key={i}>
+                 <td>{model.rank}</td>
+                 <td>{model.name}</td>
+                 <td>
+                   {parseInt((model.total_score / potentialPoints.total) * 100)}
+                 </td>
+                 <td>
+                   <Link
+                     href={`/${model.api_id.split("/").pop().toLowerCase()}`}
+                   >
+                     view
+                   </Link>
+                 </td>
+               </tr>
+             ))}
+         </tbody>
+       </table>
+     </>
+   )
+ }
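
Note: `@/utils/db` is imported above but is not part of this commit. Below is a hypothetical sketch of what it might export, assuming the postgres.js (`postgres`) package, whose tagged-template client matches the db`SELECT ...` call in page.js; the `models`/`results` table names and columns are guesses inferred from the fields the leaderboard reads (rank, name, api_id, total_score).

```js
// utils/db.js — hypothetical sketch; the real file is not shown in this commit.
import postgres from "postgres"

// Single shared client, configured via DATABASE_URL (assumed env var).
const db = postgres(process.env.DATABASE_URL)

// Assumed schema: a `models` table joined to per-prompt `results`,
// aggregated into the total_score consumed by the leaderboard table.
export async function getModels() {
  return db`
    SELECT m.rank, m.name, m.api_id, SUM(r.score) AS total_score
    FROM models m
    LEFT JOIN results r ON r.model_id = m.id
    GROUP BY m.id
    ORDER BY total_score DESC NULLS LAST
  `
}

export default db
```

With this shape, the Score column works out as a percentage of the rubric total: for example, a model with total_score 412 against a potentialPoints.total of 530 would display 77.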