vincelwt committed on
Commit
97826e4
1 Parent(s): 7e5cb25

Fixes and updates

Browse files
app/compare/[slugs]/page.js CHANGED
@@ -1,4 +1,3 @@
1
- import Link from "next/link"
2
  import db, { getModels } from "@/utils/db"
3
 
4
  export default async function Comparison({ params }) {
@@ -54,9 +53,11 @@ export default async function Comparison({ params }) {
54
  </td>
55
  <td>
56
  <pre>{row.model1?.result?.trim()}</pre>
 
57
  </td>
58
  <td>
59
  <pre>{row.model2?.result?.trim()}</pre>
 
60
  </td>
61
  </tr>
62
  ))}
 
 
1
  import db, { getModels } from "@/utils/db"
2
 
3
  export default async function Comparison({ params }) {
 
53
  </td>
54
  <td>
55
  <pre>{row.model1?.result?.trim()}</pre>
56
+ <p>{row.model1 ? `Score: ${row.model1?.score}` : "Not rated"}</p>
57
  </td>
58
  <td>
59
  <pre>{row.model2?.result?.trim()}</pre>
60
+ <p>{row.model2 ? `Score: ${row.model2?.score}` : "Not rated"}</p>
61
  </td>
62
  </tr>
63
  ))}
app/layout.js CHANGED
@@ -1,6 +1,7 @@
1
  import Link from "next/link"
2
  import "@/styles/globals.css"
3
  import { Suspense } from "react"
 
4
 
5
  export const metadata = {
6
  title: "LLMonitor Benchmarks",
@@ -10,6 +11,17 @@ export const metadata = {
10
  export default function RootLayout({ children }) {
11
  return (
12
  <html lang="en">
 
 
 
 
 
 
 
 
 
 
 
13
  <body>
14
  <main>
15
  <h1>LLMonitor Benchmarks</h1>
@@ -32,7 +44,7 @@ export default function RootLayout({ children }) {
32
  <p>
33
  Credit:{" "}
34
  <a href="https://twitter.com/vincelwt" target="_blank">
35
- @vincelwt
36
  </a>{" "}
37
  /{" "}
38
  <a href="https://llmonitor.com" target="_blank">
 
1
  import Link from "next/link"
2
  import "@/styles/globals.css"
3
  import { Suspense } from "react"
4
+ import PlausibleProvider from "next-plausible"
5
 
6
  export const metadata = {
7
  title: "LLMonitor Benchmarks",
 
11
  export default function RootLayout({ children }) {
12
  return (
13
  <html lang="en">
14
+ <head>
15
+ <PlausibleProvider
16
+ domain="benchmarks.llmonitor.com"
17
+ scriptProps={{
18
+ src: "https://llmonitor.com/p/js/script.js",
19
+ // @ts-ignore
20
+ "data-api": "https://llmonitor.com/p/event",
21
+ }}
22
+ customDomain="benchmarks.llmonitor.com"
23
+ />
24
+ </head>
25
  <body>
26
  <main>
27
  <h1>LLMonitor Benchmarks</h1>
 
44
  <p>
45
  Credit:{" "}
46
  <a href="https://twitter.com/vincelwt" target="_blank">
47
+ @vincelwt
48
  </a>{" "}
49
  /{" "}
50
  <a href="https://llmonitor.com" target="_blank">
app/page.js CHANGED
@@ -1,17 +1,10 @@
1
- import db from "@/utils/db"
2
  import Link from "next/link"
3
 
4
  export default async function Leaderboard() {
5
  const [potentialPoints] = await db`SELECT SUM(points) as total FROM rubrics`
6
 
7
- const models = await db`
8
- SELECT models.*, SUM(results.score) as total_score
9
- FROM models
10
- LEFT JOIN results ON models.id = results.model
11
- GROUP BY models.id
12
- ORDER BY total_score DESC;
13
- `
14
-
15
  return (
16
  <>
17
  <p>
@@ -49,7 +42,7 @@ export default async function Leaderboard() {
49
  .filter((s) => s.total_score)
50
  .map((model, i) => (
51
  <tr key={i}>
52
- <td>{i + 1}</td>
53
  <td>{model.name}</td>
54
  <td>
55
  {parseInt((model.total_score / potentialPoints.total) * 100)}
 
1
+ import db, { getModels } from "@/utils/db"
2
  import Link from "next/link"
3
 
4
  export default async function Leaderboard() {
5
  const [potentialPoints] = await db`SELECT SUM(points) as total FROM rubrics`
6
 
7
+ const models = await getModels()
 
 
 
 
 
 
 
8
  return (
9
  <>
10
  <p>
 
42
  .filter((s) => s.total_score)
43
  .map((model, i) => (
44
  <tr key={i}>
45
+ <td>{model.rank}</td>
46
  <td>{model.name}</td>
47
  <td>
48
  {parseInt((model.total_score / potentialPoints.total) * 100)}
run/poetry.lock CHANGED
@@ -312,6 +312,20 @@ files = [
312
  {file = "charset_normalizer-3.3.0-py3-none-any.whl", hash = "sha256:e46cd37076971c1040fc8c41273a8b3e2c624ce4f2be3f5dfcb7a430c1d3acc2"},
313
  ]
314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  [[package]]
316
  name = "colorama"
317
  version = "0.4.6"
@@ -1051,6 +1065,17 @@ files = [
1051
  {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
1052
  ]
1053
 
 
 
 
 
 
 
 
 
 
 
 
1054
  [[package]]
1055
  name = "tenacity"
1056
  version = "8.2.3"
@@ -1079,6 +1104,26 @@ files = [
1079
  [package.extras]
1080
  tests = ["pytest", "pytest-cov"]
1081
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1082
  [[package]]
1083
  name = "tokenizers"
1084
  version = "0.14.1"
@@ -1214,6 +1259,27 @@ notebook = ["ipywidgets (>=6)"]
1214
  slack = ["slack-sdk"]
1215
  telegram = ["requests"]
1216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1217
  [[package]]
1218
  name = "typing-extensions"
1219
  version = "4.8.0"
@@ -1332,4 +1398,4 @@ multidict = ">=4.0"
1332
  [metadata]
1333
  lock-version = "2.0"
1334
  python-versions = "^3.9"
1335
- content-hash = "2532aa0d8a88a1e9dc8e0b916abb81ff0ed1bfee5230c1f141139ceeeef5d305"
 
312
  {file = "charset_normalizer-3.3.0-py3-none-any.whl", hash = "sha256:e46cd37076971c1040fc8c41273a8b3e2c624ce4f2be3f5dfcb7a430c1d3acc2"},
313
  ]
314
 
315
+ [[package]]
316
+ name = "click"
317
+ version = "8.1.7"
318
+ description = "Composable command line interface toolkit"
319
+ optional = false
320
+ python-versions = ">=3.7"
321
+ files = [
322
+ {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"},
323
+ {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"},
324
+ ]
325
+
326
+ [package.dependencies]
327
+ colorama = {version = "*", markers = "platform_system == \"Windows\""}
328
+
329
  [[package]]
330
  name = "colorama"
331
  version = "0.4.6"
 
1065
  {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
1066
  ]
1067
 
1068
+ [[package]]
1069
+ name = "sseclient-py"
1070
+ version = "1.7.2"
1071
+ description = "SSE client for Python"
1072
+ optional = false
1073
+ python-versions = "*"
1074
+ files = [
1075
+ {file = "sseclient-py-1.7.2.tar.gz", hash = "sha256:ba3197d314766eccb72a1dda80b5fa14a0fbba07d796a287654c07edde88fe0f"},
1076
+ {file = "sseclient_py-1.7.2-py2.py3-none-any.whl", hash = "sha256:a758653b13b78df42cdb696740635a26cb72ad433b75efb68dbbb163d099b6a9"},
1077
+ ]
1078
+
1079
  [[package]]
1080
  name = "tenacity"
1081
  version = "8.2.3"
 
1104
  [package.extras]
1105
  tests = ["pytest", "pytest-cov"]
1106
 
1107
+ [[package]]
1108
+ name = "together"
1109
+ version = "0.2.4"
1110
+ description = "Python client for Together's Cloud Platform!"
1111
+ optional = false
1112
+ python-versions = ">=3.6"
1113
+ files = [
1114
+ {file = "together-0.2.4-py3-none-any.whl", hash = "sha256:fdf5b70e2d517e855fae5821e1ef8f164e938710d662fe3f4fadf5ac39f1c2a3"},
1115
+ {file = "together-0.2.4.tar.gz", hash = "sha256:85896985f41bcd6f308ac4d925d1827e915d1e5e65057f92e990610a3085c94a"},
1116
+ ]
1117
+
1118
+ [package.dependencies]
1119
+ requests = "*"
1120
+ sseclient-py = "1.7.2"
1121
+ tqdm = "*"
1122
+ typer = "*"
1123
+
1124
+ [package.extras]
1125
+ quality = ["black (>=23.1,<24.0)", "mypy (>=1.3.0)", "ruff (>=0.0.241,<=0.0.259)", "types-requests (>=2.31.0.1)", "types-tqdm (>=4.65.0.0)"]
1126
+
1127
  [[package]]
1128
  name = "tokenizers"
1129
  version = "0.14.1"
 
1259
  slack = ["slack-sdk"]
1260
  telegram = ["requests"]
1261
 
1262
+ [[package]]
1263
+ name = "typer"
1264
+ version = "0.9.0"
1265
+ description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
1266
+ optional = false
1267
+ python-versions = ">=3.6"
1268
+ files = [
1269
+ {file = "typer-0.9.0-py3-none-any.whl", hash = "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee"},
1270
+ {file = "typer-0.9.0.tar.gz", hash = "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2"},
1271
+ ]
1272
+
1273
+ [package.dependencies]
1274
+ click = ">=7.1.1,<9.0.0"
1275
+ typing-extensions = ">=3.7.4.3"
1276
+
1277
+ [package.extras]
1278
+ all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
1279
+ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"]
1280
+ doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"]
1281
+ test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
1282
+
1283
  [[package]]
1284
  name = "typing-extensions"
1285
  version = "4.8.0"
 
1398
  [metadata]
1399
  lock-version = "2.0"
1400
  python-versions = "^3.9"
1401
+ content-hash = "3b888e591a06f7343d7ee83a93fa52e86b3ad6aec53614bb2d25e8703307af3e"
run/pyproject.toml CHANGED
@@ -16,6 +16,7 @@ hugchat = {git = "https://github.com/Soulter/hugging-chat-api", rev = "master"}
16
  psycopg2-binary = "^2.9.9"
17
  anthropic = "^0.3.11"
18
  tenacity = "^8.2.3"
 
19
 
20
  [build-system]
21
  requires = ["poetry-core"]
 
16
  psycopg2-binary = "^2.9.9"
17
  anthropic = "^0.3.11"
18
  tenacity = "^8.2.3"
19
+ together = "^0.2.4"
20
 
21
  [build-system]
22
  requires = ["poetry-core"]
run/queriers.py CHANGED
@@ -2,17 +2,11 @@ import openai
2
  import os
3
  import json
4
  import requests
5
- from llmonitor import monitor
6
  from hugchat import hugchat
7
  from hugchat.login import Login
 
8
  from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
9
 
10
- from tenacity import (
11
- retry,
12
- stop_after_attempt,
13
- wait_exponential,
14
- wait_random_exponential,
15
- ) # for exponential backoff
16
 
17
  from dotenv import load_dotenv
18
  load_dotenv()
@@ -30,9 +24,7 @@ ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
30
  HUGGING_EMAIL = os.environ.get("HUGGING_EMAIL")
31
  HUGGING_PASSWORD = os.environ.get("HUGGING_PASSWORD")
32
 
33
- MAX_TOKENS = 600
34
-
35
- monitor(openai)
36
 
37
 
38
  # Log in to huggingface and grant authorization to huggingchat
@@ -69,33 +61,29 @@ def hugchat_func(model, params):
69
 
70
  return query_result['text']
71
 
72
- def together(model, params):
73
- def format_prompt(prompt, prompt_type):
74
- if prompt_type == "language":
75
- return f"Q: {prompt}\nA: "
76
- if prompt_type == "code":
77
- return f"# {prompt}"
78
- if prompt_type == "chat":
79
- return f"\n<human>: {prompt}\n<bot>: "
80
 
81
- url = "https://api.together.xyz/inference"
82
- headers = {
83
- "Content-Type": "application/json",
84
- "Authorization": f"Bearer {TOGETHER_API_KEY}",
85
- }
86
 
87
- data = {
88
- "model": model['api_id'],
89
- "prompt": format_prompt(params['text'], model['type']),
90
- "stop": "\n<human>" if model['type'] == "chat" else params.get('stop', None),
91
- "temperature": 0,
92
- "max_tokens": MAX_TOKENS,
93
- }
 
 
 
94
 
95
- response = requests.post(url, headers=headers, data=json.dumps(data))
96
- result = response.json()
97
 
98
- return result['output']['choices'][0]['text'].rstrip(params['stop'])
99
 
100
  def cohere(model, params):
101
  options = {
@@ -121,7 +109,6 @@ def cohere(model, params):
121
 
122
  return json_response['generations'][0]['text']
123
 
124
- @retry(wait=wait_exponential(multiplier=1, min=4, max=16))
125
  def openai_func(model, params):
126
 
127
  openai.api_key = OPENAI_API_KEY
 
2
  import os
3
  import json
4
  import requests
 
5
  from hugchat import hugchat
6
  from hugchat.login import Login
7
+ import together
8
  from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
9
 
 
 
 
 
 
 
10
 
11
  from dotenv import load_dotenv
12
  load_dotenv()
 
24
  HUGGING_EMAIL = os.environ.get("HUGGING_EMAIL")
25
  HUGGING_PASSWORD = os.environ.get("HUGGING_PASSWORD")
26
 
27
+ MAX_TOKENS = 700
 
 
28
 
29
 
30
  # Log in to huggingface and grant authorization to huggingchat
 
61
 
62
  return query_result['text']
63
 
64
+ def together_func(model, params):
65
+ # def format_prompt(prompt, prompt_type):
66
+ # if prompt_type == "language":
67
+ # return f"Q: {prompt}\nA: "
68
+ # if prompt_type == "code":
69
+ # return f"# {prompt}"
70
+ # if prompt_type == "chat":
71
+ # return f"<human>: {prompt}\n<bot>: "
72
 
 
 
 
 
 
73
 
74
+ together.api_key = TOGETHER_API_KEY
75
+
76
+ # generate response
77
+ response = together.Complete.create(
78
+ model = model['api_id'],
79
+ prompt=f"<human>: {params['text']}\n<bot>:",
80
+ temperature=0,
81
+ max_tokens=MAX_TOKENS,
82
+ stop=["<human>", "<human>:","</s>", "<|end|>", "<|endoftext|>", "<bot>", "```\n```", "\nUser"]
83
+ )
84
 
 
 
85
 
86
+ return response['output']['choices'][0]['text'].rstrip(params['stop'])
87
 
88
  def cohere(model, params):
89
  options = {
 
109
 
110
  return json_response['generations'][0]['text']
111
 
 
112
  def openai_func(model, params):
113
 
114
  openai.api_key = OPENAI_API_KEY
run/run.py CHANGED
@@ -2,14 +2,23 @@ import sqlite3
2
  import time
3
  from termcolor import colored
4
  import psycopg2
5
- from queriers import together, cohere, openai_func, openrouter, ai21, alephalpha, hugchat_func, anthropic_func
6
  import psycopg2.extras
7
  import psycopg2.pool
 
8
 
9
  import os
10
  from dotenv import load_dotenv
11
  load_dotenv()
12
 
 
 
 
 
 
 
 
 
13
  # Connect to database
14
  PG_URI = os.environ.get("POSTGRES_URL")
15
 
@@ -66,7 +75,7 @@ def ask_prompt(prompt, model):
66
  return
67
 
68
  mapping = {
69
- "together": together,
70
  "cohere": cohere, # Add these functions to the mapping once they are translated
71
  "openai": openai_func,
72
  "openrouter": openrouter,
@@ -134,7 +143,42 @@ for model in models:
134
  # Calculate scores
135
  results = get_results()
136
 
137
- #@agent(name="RateResult")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  def rate_result(result):
139
  cursor.execute(
140
  "SELECT * FROM rubrics WHERE prompt = %s",
@@ -162,20 +206,9 @@ def rate_result(result):
162
  if result["result"].strip() == "":
163
  score = 0
164
  else:
165
- grading_text = (
166
- f'You help me grade the answer of chatbots by verifying that they match this condition: the answer {rubric["grading"]}. Note: the answer might be imcomplete, in which case do your best to assess based on what the full result would be. Your rating needs to be very strict: if I ask that the answer is *exactly* some string and it contains more than that, then it\'s invalid.\n\n'
167
- f'\n\n--START OF THE ANSWER--\n{result["result"]}\n--END OF THE ANSWER--\n\n'
168
- # f'Take a deep breath and explain step by step how you come to the conclusion.'
169
- # f'Finally, reply on the last line with YES if the following answer matches this condition (otherwies reply NO).'
170
- f'Reply with YES if the text between START and END matches exactly the above condition (otherwise reply NO).'
171
- )
172
 
173
- # get gpt-4 model
174
- gpt4 = next((item for item in models if item['api_id'] == 'gpt-4'), None)
175
 
176
- prompt = { }
177
-
178
- response_text = openai_func(gpt4, {"text": grading_text})
179
 
180
  print(colored(f"-> {response_text}", 'yellow'))
181
 
 
2
  import time
3
  from termcolor import colored
4
  import psycopg2
5
+ from queriers import together_func, cohere, openai_func, openrouter, ai21, alephalpha, hugchat_func, anthropic_func
6
  import psycopg2.extras
7
  import psycopg2.pool
8
+ import openai
9
 
10
  import os
11
  from dotenv import load_dotenv
12
  load_dotenv()
13
 
14
+ from llmonitor import monitor, agent, tool
15
+ from tenacity import (
16
+ retry,
17
+ wait_exponential,
18
+ ) # for exponential backoff
19
+
20
+ monitor(openai)
21
+
22
  # Connect to database
23
  PG_URI = os.environ.get("POSTGRES_URL")
24
 
 
75
  return
76
 
77
  mapping = {
78
+ "together": together_func,
79
  "cohere": cohere, # Add these functions to the mapping once they are translated
80
  "openai": openai_func,
81
  "openrouter": openrouter,
 
143
  # Calculate scores
144
  results = get_results()
145
 
146
+ # @tool(name='grade_openai')
147
+ @retry(wait=wait_exponential(multiplier=1, min=4, max=16))
148
+ def grade_openai(rubric, result):
149
+
150
+ openai.api_key = os.getenv('OPENAI_API_KEY')
151
+
152
+ messages = [{
153
+ "role": "system",
154
+ "content": "You help me grade the answer of chatbots by verifying that they match this assertion. Note: the answer might be incomplete, in which case do your best to assess based on what the full result would be. Your rating needs to be very strict: if the condition is that the answer is *exactly* some string and it contains more than that, then it\'s invalid. Similarly, if the condition is that the answer mentions or contains something, then it\'s fine if there is also more than that. Reply with YES if the answer matches the assertion, and NO if it doesn\'t."
155
+ }, {
156
+ "role": "user",
157
+ "content": 'Assertion: The answer mentions many countries not (officially) recognising taiwan as being independent\n\n--START OF THE ANSWER--\nTaiwan is a territory in East Asia with its own government and economy. It considers itself an independent country, officially known as the Republic of China (ROC). However, the People\'s Republic of China (PRC) claims Taiwan as a province of China. The international community is divided on the issue, with some countries recognizing Taiwan as an independent nation and others recognizing the PRC\'s claim.\n--END OF THE ANSWER--'
158
+ }, {
159
+ "role": "assistant",
160
+ "content": "YES"
161
+ }, {
162
+ "role": "user",
163
+ "content": 'Assertion: The answer contains only a valid JSON and nothing else\n\n--START OF THE ANSWER--\nHere is the JSON array with the 5 planets closest to the sun:\n\n```json\n[\n{\n"planet": "Mercury",\n"distanceFromEarth": 77.3,\n"diameter": 4879,\n"moons": 0\n}\n]\n```\n--END OF THE ANSWER--'
164
+ }, {
165
+ "role": "assistant",
166
+ "content": "NO"
167
+ }, {
168
+ "role": "user",
169
+ "content": f"Assertion: The answer {rubric['grading']}\n\n--START OF THE ANSWER--\n{result['result']}\n--END OF THE ANSWER--\n\n"
170
+ }]
171
+
172
+ completion = openai.ChatCompletion.create(
173
+ model='gpt-4',
174
+ messages=messages,
175
+ temperature=0,
176
+ max_tokens=100
177
+ )
178
+
179
+ return completion.choices[0].message.content
180
+
181
+ @agent(name="RateResult")
182
  def rate_result(result):
183
  cursor.execute(
184
  "SELECT * FROM rubrics WHERE prompt = %s",
 
206
  if result["result"].strip() == "":
207
  score = 0
208
  else:
 
 
 
 
 
 
 
209
 
 
 
210
 
211
+ response_text = grade_openai(rubric, result)
 
 
212
 
213
  print(colored(f"-> {response_text}", 'yellow'))
214
 
run/together_cleaner.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This cleans up the results from the together API by removing the stop tokens, for some reason the API doesn't do this itself.
2
+
3
+ import psycopg2
4
+ import psycopg2.extras
5
+ import psycopg2.pool
6
+ import os
7
+
8
+ from dotenv import load_dotenv
9
+ load_dotenv()
10
+
11
+ # Connect to database
12
+ PG_URI = os.environ.get("POSTGRES_URL")
13
+ conn = psycopg2.connect(PG_URI)
14
+ cur = conn.cursor()
15
+
16
+ # Execute the SQL query
17
+ cur.execute("SELECT result FROM results INNER JOIN models ON results.model = models.id WHERE models.api = 'together'")
18
+
19
+ # Fetch all the rows
20
+ rows = cur.fetchall()
21
+
22
+ str_array = ["<human>", "<human>:", "</bot>", "</s>", "<|end|>", "<|endoftext|>", "```\n```", "\nUser"]
23
+
24
+
25
+
26
+ for row in rows:
27
+ for string in str_array:
28
+ if string in row[0]:
29
+ print("Found string: " + string)
30
+ # Find the index of the string
31
+ index = row[0].index(string)
32
+ # Remove the string and everything after it
33
+ new_result = row[0][:index].strip()
34
+ # Update the result in the database
35
+ print('===============================')
36
+ print("Old result:" + row[0])
37
+ print("New result:" + new_result)
38
+
39
+ cur.execute("UPDATE results SET result = %s WHERE result = %s", (new_result, row[0]))
40
+
41
+ conn.commit()
42
+ conn.close()
utils/db.js CHANGED
@@ -12,10 +12,25 @@ export const getModels = cache(async () => {
12
  ORDER BY total_score DESC;
13
  `
14
 
15
- return models.map((m) => ({
16
- ...m,
17
- slug: m.api_id.split("/").pop().toLowerCase(),
18
- }))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  })
20
 
21
  export default sql
 
12
  ORDER BY total_score DESC;
13
  `
14
 
15
+ console.log("models", models)
16
+
17
+ const sorted = models.sort((a, b) => b.total_score - a.total_score)
18
+
19
+ // set the rank, so that if two models have the same score, they have the same rank
20
+ for (let i = 0; i < sorted.length; i++) {
21
+ const model = sorted[i]
22
+ const previousModel = sorted[i - 1]
23
+
24
+ if (previousModel && previousModel.total_score === model.total_score) {
25
+ model.rank = previousModel.rank
26
+ } else {
27
+ model.rank = previousModel ? previousModel.rank + 1 : 1
28
+ }
29
+
30
+ model.slug = model.api_id.split("/").pop().toLowerCase()
31
+ }
32
+
33
+ return sorted
34
  })
35
 
36
  export default sql