Test agent results
Browse files
results/test-agent/README.md
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
### Test agent
|
results/test-agent/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "test-agent",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "MiniWoB",
|
7 |
+
"score": 43.4,
|
8 |
+
"std_err": 0.1,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/test-agent/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "test-agent",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WebArena",
|
7 |
+
"score": 6.7,
|
8 |
+
"std_err": 0.2,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/test-agent/workarena-l1.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "test-agent",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WorkArena-L1",
|
7 |
+
"score": 6.1,
|
8 |
+
"std_err": 0.3,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/test-agent/workarena-l2.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "test-agent",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WorkArena-L2",
|
7 |
+
"score": 0.0,
|
8 |
+
"std_err": 0.0,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/test-agent/workarena-l3.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "test-agent",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WorkArena-L3",
|
7 |
+
"score": 0.0,
|
8 |
+
"std_err": 0.0,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|