Added reproducibility journal metadata
Browse files- results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json +3 -3
- results/GenericAgent-Claude-3.5-Sonnet/miniwob.json +1 -1
- results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json +2 -2
- results/GenericAgent-Claude-3.5-Sonnet/webarena.json +2 -2
- results/GenericAgent-Claude-3.5-Sonnet/weblinx.json +2 -2
- results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json +1 -1
- results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json +1 -1
- results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json +1 -1
- results/GenericAgent-GPT-4o-mini/assistantbench.json +3 -3
- results/GenericAgent-GPT-4o-mini/miniwob.json +1 -1
- results/GenericAgent-GPT-4o-mini/visualwebarena.json +2 -2
- results/GenericAgent-GPT-4o-mini/webarena.json +2 -2
- results/GenericAgent-GPT-4o-mini/weblinx.json +2 -2
- results/GenericAgent-GPT-4o-mini/workarena-l1.json +1 -1
- results/GenericAgent-GPT-4o-mini/workarena-l2.json +1 -1
- results/GenericAgent-GPT-4o-mini/workarena-l3.json +2 -2
- results/GenericAgent-GPT-4o/assistantbench.json +3 -3
- results/GenericAgent-GPT-4o/miniwob.json +1 -1
- results/GenericAgent-GPT-4o/visualwebarena.json +2 -2
- results/GenericAgent-GPT-4o/webarena.json +2 -2
- results/GenericAgent-GPT-4o/weblinx.json +2 -2
- results/GenericAgent-GPT-4o/workarena-l1.json +1 -1
- results/GenericAgent-GPT-4o/workarena-l2.json +1 -1
- results/GenericAgent-GPT-4o/workarena-l3.json +2 -2
- results/GenericAgent-GPT-o1-mini/assistantbench.json +3 -3
- results/GenericAgent-GPT-o1-mini/miniwob.json +2 -2
- results/GenericAgent-GPT-o1-mini/webarena.json +2 -2
- results/GenericAgent-GPT-o1-mini/weblinx.json +2 -2
- results/GenericAgent-GPT-o1-mini/workarena-l1.json +2 -2
- results/GenericAgent-GPT-o1-mini/workarena-l2.json +2 -2
- results/GenericAgent-GPT-o1-mini/workarena-l3.json +2 -2
- results/GenericAgent-Llama-3.1-405b/assistantbench.json +3 -3
- results/GenericAgent-Llama-3.1-405b/miniwob.json +2 -2
- results/GenericAgent-Llama-3.1-405b/webarena.json +2 -2
- results/GenericAgent-Llama-3.1-405b/weblinx.json +2 -2
- results/GenericAgent-Llama-3.1-405b/workarena-l1.json +2 -2
- results/GenericAgent-Llama-3.1-405b/workarena-l2.json +2 -2
- results/GenericAgent-Llama-3.1-405b/workarena-l3.json +2 -2
- results/GenericAgent-Llama-3.1-70b/assistantbench.json +3 -3
- results/GenericAgent-Llama-3.1-70b/miniwob.json +2 -2
- results/GenericAgent-Llama-3.1-70b/webarena.json +2 -2
- results/GenericAgent-Llama-3.1-70b/weblinx.json +2 -2
- results/GenericAgent-Llama-3.1-70b/workarena-l1.json +2 -2
- results/GenericAgent-Llama-3.1-70b/workarena-l2.json +2 -2
- results/GenericAgent-Llama-3.1-70b/workarena-l3.json +2 -2
results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "AssistantBench",
|
6 |
"score": 5.2,
|
7 |
"std_err": 1.5,
|
@@ -9,8 +9,8 @@
|
|
9 |
"benchmark_tuned": "No",
|
10 |
"followed_evaluation_protocol": "Yes",
|
11 |
"reproducible": "Yes",
|
12 |
-
"comments": "
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
|
5 |
"benchmark": "AssistantBench",
|
6 |
"score": 5.2,
|
7 |
"std_err": 1.5,
|
|
|
9 |
"benchmark_tuned": "No",
|
10 |
"followed_evaluation_protocol": "Yes",
|
11 |
"reproducible": "Yes",
|
12 |
+
"comments": "Intersection of finished tasks across agents.",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-11-28 19:34:58"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Claude-3.5-Sonnet/miniwob.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 69.8,
|
7 |
"std_err": 1.8,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "2024-10-25_06-08-16",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 69.8,
|
7 |
"std_err": 1.8,
|
results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "VisualWebArena",
|
6 |
"score": 21.0,
|
7 |
"std_err": 1.3,
|
@@ -11,6 +11,6 @@
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "22f0611d-aeea-4ee9-a533-b45442b5e080",
|
5 |
"benchmark": "VisualWebArena",
|
6 |
"score": 21.0,
|
7 |
"std_err": 1.3,
|
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-12-02 09:11:35"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Claude-3.5-Sonnet/webarena.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "WebArena",
|
6 |
"score": 36.2,
|
7 |
"std_err": 1.7,
|
@@ -11,6 +11,6 @@
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae",
|
5 |
"benchmark": "WebArena",
|
6 |
"score": 36.2,
|
7 |
"std_err": 1.7,
|
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-11-29 22:37:46"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Claude-3.5-Sonnet/weblinx.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "WebLINX",
|
6 |
"score": 13.7,
|
7 |
"std_err": 0.6,
|
@@ -11,6 +11,6 @@
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
|
5 |
"benchmark": "WebLINX",
|
6 |
"score": 13.7,
|
7 |
"std_err": 0.6,
|
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-11-07 21:42:30"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "WorkArena-L1",
|
6 |
"score": 56.4,
|
7 |
"std_err": 2.7,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "2024-10-23_14-17-40",
|
5 |
"benchmark": "WorkArena-L1",
|
6 |
"score": 56.4,
|
7 |
"std_err": 2.7,
|
results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
"score": 39.1,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "2024-10-23_17-10-46",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
"score": 39.1,
|
results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
"score": 0.4,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "2024-10-24_18-06-57",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
"score": 0.4,
|
results/GenericAgent-GPT-4o-mini/assistantbench.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "AssistantBench",
|
7 |
"score": 2.1,
|
8 |
"std_err": 1.0,
|
@@ -10,7 +10,7 @@
|
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
12 |
"reproducible": "Yes",
|
13 |
-
"comments": "
|
14 |
"original_or_reproduced": "Original"
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
+
"study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
|
5 |
+
"date_time": "2024-11-28 19:34:58",
|
6 |
"benchmark": "AssistantBench",
|
7 |
"score": 2.1,
|
8 |
"std_err": 1.0,
|
|
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
12 |
"reproducible": "Yes",
|
13 |
+
"comments": "Intersection of finished tasks across agents.",
|
14 |
"original_or_reproduced": "Original"
|
15 |
}
|
16 |
]
|
results/GenericAgent-GPT-4o-mini/miniwob.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
7 |
"score": 56.6,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
+
"study_id": "2024-10-25_06-08-16",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
7 |
"score": 56.6,
|
results/GenericAgent-GPT-4o-mini/visualwebarena.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "VisualWebArena",
|
7 |
"score": 16.9,
|
8 |
"std_err": 1.2,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
+
"study_id": "8d8642d3-757a-4346-ba45-01398f85b1f4",
|
5 |
+
"date_time": "2024-12-02 02:54:33",
|
6 |
"benchmark": "VisualWebArena",
|
7 |
"score": 16.9,
|
8 |
"std_err": 1.2,
|
results/GenericAgent-GPT-4o-mini/webarena.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WebArena",
|
7 |
"score": 17.4,
|
8 |
"std_err": 1.3,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
+
"study_id": "c6bdeb87-9879-4c06-aa70-00d895001156",
|
5 |
+
"date_time": "2024-11-29 19:25:49",
|
6 |
"benchmark": "WebArena",
|
7 |
"score": 17.4,
|
8 |
"std_err": 1.3,
|
results/GenericAgent-GPT-4o-mini/weblinx.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WebLINX",
|
7 |
"score": 11.6,
|
8 |
"std_err": 0.6,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
+
"study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
|
5 |
+
"date_time": "2024-11-07 21:42:30",
|
6 |
"benchmark": "WebLINX",
|
7 |
"score": 11.6,
|
8 |
"std_err": 0.6,
|
results/GenericAgent-GPT-4o-mini/workarena-l1.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L1",
|
7 |
"score": 27,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
+
"study_id": "2024-10-23_14-17-40",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L1",
|
7 |
"score": 27,
|
results/GenericAgent-GPT-4o-mini/workarena-l2.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
"score": 1.3,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
+
"study_id": "2024-10-23_17-10-46",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
"score": 1.3,
|
results/GenericAgent-GPT-4o-mini/workarena-l3.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
"score": 0.0,
|
8 |
"std_err": 0.0,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
+
"study_id": "-",
|
5 |
+
"date_time": "2024-10-24 23:03:30",
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
"score": 0.0,
|
8 |
"std_err": 0.0,
|
results/GenericAgent-GPT-4o/assistantbench.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "AssistantBench",
|
7 |
"score": 4.8,
|
8 |
"std_err": 2.4,
|
@@ -10,7 +10,7 @@
|
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
12 |
"reproducible": "Yes",
|
13 |
-
"comments": "
|
14 |
"original_or_reproduced": "Original"
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
+
"study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
|
5 |
+
"date_time": "2024-11-28 19:34:58",
|
6 |
"benchmark": "AssistantBench",
|
7 |
"score": 4.8,
|
8 |
"std_err": 2.4,
|
|
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
12 |
"reproducible": "Yes",
|
13 |
+
"comments": "Intersection of finished tasks across agents.",
|
14 |
"original_or_reproduced": "Original"
|
15 |
}
|
16 |
]
|
results/GenericAgent-GPT-4o/miniwob.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
7 |
"score": 63.8,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
+
"study_id": "2024-10-25_06-08-16",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
7 |
"score": 63.8,
|
results/GenericAgent-GPT-4o/visualwebarena.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "VisualWebArena",
|
7 |
"score": 26.7,
|
8 |
"std_err": 1.5,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
+
"study_id": "7fb7eac8-4bbd-4ebe-be32-15901a7678f2",
|
5 |
+
"date_time": "2024-12-02 07:17:28",
|
6 |
"benchmark": "VisualWebArena",
|
7 |
"score": 26.7,
|
8 |
"std_err": 1.5,
|
results/GenericAgent-GPT-4o/webarena.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WebArena",
|
7 |
"score": 31.4,
|
8 |
"std_err": 1.6,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
+
"study_id": "d2eed215-91bb-4603-b69c-8ef8f9d57f34",
|
5 |
+
"date_time": "2024-11-29 22:28:32",
|
6 |
"benchmark": "WebArena",
|
7 |
"score": 31.4,
|
8 |
"std_err": 1.6,
|
results/GenericAgent-GPT-4o/weblinx.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WebLINX",
|
7 |
"score": 12.5,
|
8 |
"std_err": 0.6,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
+
"study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
|
5 |
+
"date_time": "2024-11-07 21:42:30",
|
6 |
"benchmark": "WebLINX",
|
7 |
"score": 12.5,
|
8 |
"std_err": 0.6,
|
results/GenericAgent-GPT-4o/workarena-l1.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L1",
|
7 |
"score": 45.5,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
+
"study_id": "2024-10-23_14-17-40",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L1",
|
7 |
"score": 45.5,
|
results/GenericAgent-GPT-4o/workarena-l2.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
"score": 8.5,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
+
"study_id": "2024-10-23_17-10-46",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
"score": 8.5,
|
results/GenericAgent-GPT-4o/workarena-l3.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
"score": 0.0,
|
8 |
"std_err": 0.0,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-4o",
|
4 |
+
"study_id": "-",
|
5 |
+
"date_time": "2024-10-24 23:03:30",
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
"score": 0.0,
|
8 |
"std_err": 0.0,
|
results/GenericAgent-GPT-o1-mini/assistantbench.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-o1-mini",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "AssistantBench",
|
7 |
"score": 6.9,
|
8 |
"std_err": 2.2,
|
@@ -10,7 +10,7 @@
|
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
12 |
"reproducible": "Yes",
|
13 |
-
"comments": "
|
14 |
"original_or_reproduced": "Original"
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-o1-mini",
|
4 |
+
"study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
|
5 |
+
"date_time": "2024-11-28 19:34:58",
|
6 |
"benchmark": "AssistantBench",
|
7 |
"score": 6.9,
|
8 |
"std_err": 2.2,
|
|
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
12 |
"reproducible": "Yes",
|
13 |
+
"comments": "Intersection of finished tasks across agents.",
|
14 |
"original_or_reproduced": "Original"
|
15 |
}
|
16 |
]
|
results/GenericAgent-GPT-o1-mini/miniwob.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-o1-mini",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "MiniWoB",
|
7 |
"score": 67.8,
|
8 |
"std_err": 1.9,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-o1-mini",
|
4 |
+
"study_id": "2024-10-25_06-08-16",
|
5 |
+
"date_time": "2024-10-25 17:16:23",
|
6 |
"benchmark": "MiniWoB",
|
7 |
"score": 67.8,
|
8 |
"std_err": 1.9,
|
results/GenericAgent-GPT-o1-mini/webarena.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-o1-mini",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WebArena",
|
7 |
"score": 28.6,
|
8 |
"std_err": 1.6,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-o1-mini",
|
4 |
+
"study_id": "1827983d-5e84-4b63-ad49-bf45ec2a6348",
|
5 |
+
"date_time": "2024-11-30 00:22:44",
|
6 |
"benchmark": "WebArena",
|
7 |
"score": 28.6,
|
8 |
"std_err": 1.6,
|
results/GenericAgent-GPT-o1-mini/weblinx.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-o1-mini",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WebLINX",
|
7 |
"score": 12.5,
|
8 |
"std_err": 0.6,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-o1-mini",
|
4 |
+
"study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
|
5 |
+
"date_time": "2024-11-07 21:42:30",
|
6 |
"benchmark": "WebLINX",
|
7 |
"score": 12.5,
|
8 |
"std_err": 0.6,
|
results/GenericAgent-GPT-o1-mini/workarena-l1.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-o1-mini",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WorkArena-L1",
|
7 |
"score": 56.7,
|
8 |
"std_err": 2.7,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-o1-mini",
|
4 |
+
"study_id": "2024-10-23_14-17-40",
|
5 |
+
"date_time": "2024-10-23 22:30:06",
|
6 |
"benchmark": "WorkArena-L1",
|
7 |
"score": 56.7,
|
8 |
"std_err": 2.7,
|
results/GenericAgent-GPT-o1-mini/workarena-l2.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-o1-mini",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
"score": 14.9,
|
8 |
"std_err": 2.3,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-o1-mini",
|
4 |
+
"study_id": "2024-10-23_17-10-46",
|
5 |
+
"date_time": "2024-10-24 17:08:53",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
"score": 14.9,
|
8 |
"std_err": 2.3,
|
results/GenericAgent-GPT-o1-mini/workarena-l3.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-o1-mini",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
"score": 0.0,
|
8 |
"std_err": 0.0,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-GPT-o1-mini",
|
4 |
+
"study_id": "-",
|
5 |
+
"date_time": "2024-10-24 23:03:30",
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
"score": 0.0,
|
8 |
"std_err": 0.0,
|
results/GenericAgent-Llama-3.1-405b/assistantbench.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-405b",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "AssistantBench",
|
6 |
"score": 3.9,
|
7 |
"std_err": 1.0,
|
@@ -9,8 +9,8 @@
|
|
9 |
"benchmark_tuned": "No",
|
10 |
"followed_evaluation_protocol": "Yes",
|
11 |
"reproducible": "Yes",
|
12 |
-
"comments": "
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-405b",
|
4 |
+
"study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
|
5 |
"benchmark": "AssistantBench",
|
6 |
"score": 3.9,
|
7 |
"std_err": 1.0,
|
|
|
9 |
"benchmark_tuned": "No",
|
10 |
"followed_evaluation_protocol": "Yes",
|
11 |
"reproducible": "Yes",
|
12 |
+
"comments": "Intersection of finished tasks across agents.",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-11-28 19:34:58"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Llama-3.1-405b/miniwob.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-405b",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 64.6,
|
7 |
"std_err": 1.9,
|
@@ -11,6 +11,6 @@
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-405b",
|
4 |
+
"study_id": "4d748972-6d35-4489-a197-138b656a7db3",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 64.6,
|
7 |
"std_err": 1.9,
|
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-11-29 16:14:00"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Llama-3.1-405b/webarena.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-405b",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "WebArena",
|
6 |
"score": 24.0,
|
7 |
"std_err": 1.5,
|
@@ -11,6 +11,6 @@
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-405b",
|
4 |
+
"study_id": "aaeca13d-0cf5-444f-8445-590350b54746",
|
5 |
"benchmark": "WebArena",
|
6 |
"score": 24.0,
|
7 |
"std_err": 1.5,
|
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-12-01 00:04:43"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Llama-3.1-405b/weblinx.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-405b",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "WebLINX",
|
6 |
"score": 7.9,
|
7 |
"std_err": 0.5,
|
@@ -11,6 +11,6 @@
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-405b",
|
4 |
+
"study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
|
5 |
"benchmark": "WebLINX",
|
6 |
"score": 7.9,
|
7 |
"std_err": 0.5,
|
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-11-07 21:42:30"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Llama-3.1-405b/workarena-l1.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-405b",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "WorkArena-L1",
|
6 |
"score": 43.3,
|
7 |
"std_err": 2.7,
|
@@ -11,6 +11,6 @@
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-405b",
|
4 |
+
"study_id": "2024-10-25_17-34-45",
|
5 |
"benchmark": "WorkArena-L1",
|
6 |
"score": 43.3,
|
7 |
"std_err": 2.7,
|
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-10-25 20:32:26"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Llama-3.1-405b/workarena-l2.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-405b",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
"score": 7.2,
|
8 |
"std_err": 1.7,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-405b",
|
4 |
+
"study_id": "528da1f2-1949-41dc-b988-85f19f435af2",
|
5 |
+
"date_time": "2024-11-29 14:28:47",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
"score": 7.2,
|
8 |
"std_err": 1.7,
|
results/GenericAgent-Llama-3.1-405b/workarena-l3.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-405b",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "WorkArena-L3",
|
6 |
"score": 0.0,
|
7 |
"std_err": 0.0,
|
@@ -11,6 +11,6 @@
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-405b",
|
4 |
+
"study_id": "-",
|
5 |
"benchmark": "WorkArena-L3",
|
6 |
"score": 0.0,
|
7 |
"std_err": 0.0,
|
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-10-24 23:03:30"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Llama-3.1-70b/assistantbench.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-70b",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "AssistantBench",
|
6 |
"score": 2.8,
|
7 |
"std_err": 1.1,
|
@@ -9,8 +9,8 @@
|
|
9 |
"benchmark_tuned": "No",
|
10 |
"followed_evaluation_protocol": "Yes",
|
11 |
"reproducible": "Yes",
|
12 |
-
"comments": "
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-70b",
|
4 |
+
"study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
|
5 |
"benchmark": "AssistantBench",
|
6 |
"score": 2.8,
|
7 |
"std_err": 1.1,
|
|
|
9 |
"benchmark_tuned": "No",
|
10 |
"followed_evaluation_protocol": "Yes",
|
11 |
"reproducible": "Yes",
|
12 |
+
"comments": "Intersection of finished tasks across agents.",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-11-28 19:34:58"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Llama-3.1-70b/miniwob.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-70b",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 57.6,
|
7 |
"std_err": 2.0,
|
@@ -11,6 +11,6 @@
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-70b",
|
4 |
+
"study_id": "2024-10-25_06-08-16",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 57.6,
|
7 |
"std_err": 2.0,
|
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-10-25 17:16:23"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Llama-3.1-70b/webarena.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-70b",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "WebArena",
|
6 |
"score": 18.4,
|
7 |
"std_err": 1.4,
|
@@ -11,6 +11,6 @@
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-70b",
|
4 |
+
"study_id": "fc5747bc-d998-4942-a0eb-e55a3ccc1cb3",
|
5 |
"benchmark": "WebArena",
|
6 |
"score": 18.4,
|
7 |
"std_err": 1.4,
|
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-12-02 23:18:38"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Llama-3.1-70b/weblinx.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-70b",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "WebLINX",
|
6 |
"score": 8.9,
|
7 |
"std_err": 0.5,
|
@@ -11,6 +11,6 @@
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-70b",
|
4 |
+
"study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
|
5 |
"benchmark": "WebLINX",
|
6 |
"score": 8.9,
|
7 |
"std_err": 0.5,
|
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-11-07 21:42:30"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Llama-3.1-70b/workarena-l1.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-70b",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "WorkArena-L1",
|
6 |
"score": 27.9,
|
7 |
"std_err": 2.5,
|
@@ -11,6 +11,6 @@
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-70b",
|
4 |
+
"study_id": "2024-10-23_14-17-40",
|
5 |
"benchmark": "WorkArena-L1",
|
6 |
"score": 27.9,
|
7 |
"std_err": 2.5,
|
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-10-23 22:30:06"
|
15 |
}
|
16 |
]
|
results/GenericAgent-Llama-3.1-70b/workarena-l2.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-70b",
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
"score": 2.1,
|
8 |
"std_err": 0.9,
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-70b",
|
4 |
+
"study_id": "2024-10-23_17-10-46",
|
5 |
+
"date_time": "2024-10-24 17:08:53",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
"score": 2.1,
|
8 |
"std_err": 0.9,
|
results/GenericAgent-Llama-3.1-70b/workarena-l3.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-70b",
|
4 |
-
"study_id": "
|
5 |
"benchmark": "WorkArena-L3",
|
6 |
"score": 0.0,
|
7 |
"std_err": 0.0,
|
@@ -11,6 +11,6 @@
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "
|
15 |
}
|
16 |
]
|
|
|
1 |
[
|
2 |
{
|
3 |
"agent_name": "GenericAgent-Llama-3.1-70b",
|
4 |
+
"study_id": "-",
|
5 |
"benchmark": "WorkArena-L3",
|
6 |
"score": 0.0,
|
7 |
"std_err": 0.0,
|
|
|
11 |
"reproducible": "Yes",
|
12 |
"comments": "NA",
|
13 |
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2024-10-24 23:03:30"
|
15 |
}
|
16 |
]
|