meghsn committed on
Commit
52facf3
·
1 Parent(s): 90d6776

Added reproducibility journal metadata

Browse files
Files changed (45) hide show
  1. results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json +3 -3
  2. results/GenericAgent-Claude-3.5-Sonnet/miniwob.json +1 -1
  3. results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json +2 -2
  4. results/GenericAgent-Claude-3.5-Sonnet/webarena.json +2 -2
  5. results/GenericAgent-Claude-3.5-Sonnet/weblinx.json +2 -2
  6. results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json +1 -1
  7. results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json +1 -1
  8. results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json +1 -1
  9. results/GenericAgent-GPT-4o-mini/assistantbench.json +3 -3
  10. results/GenericAgent-GPT-4o-mini/miniwob.json +1 -1
  11. results/GenericAgent-GPT-4o-mini/visualwebarena.json +2 -2
  12. results/GenericAgent-GPT-4o-mini/webarena.json +2 -2
  13. results/GenericAgent-GPT-4o-mini/weblinx.json +2 -2
  14. results/GenericAgent-GPT-4o-mini/workarena-l1.json +1 -1
  15. results/GenericAgent-GPT-4o-mini/workarena-l2.json +1 -1
  16. results/GenericAgent-GPT-4o-mini/workarena-l3.json +2 -2
  17. results/GenericAgent-GPT-4o/assistantbench.json +3 -3
  18. results/GenericAgent-GPT-4o/miniwob.json +1 -1
  19. results/GenericAgent-GPT-4o/visualwebarena.json +2 -2
  20. results/GenericAgent-GPT-4o/webarena.json +2 -2
  21. results/GenericAgent-GPT-4o/weblinx.json +2 -2
  22. results/GenericAgent-GPT-4o/workarena-l1.json +1 -1
  23. results/GenericAgent-GPT-4o/workarena-l2.json +1 -1
  24. results/GenericAgent-GPT-4o/workarena-l3.json +2 -2
  25. results/GenericAgent-GPT-o1-mini/assistantbench.json +3 -3
  26. results/GenericAgent-GPT-o1-mini/miniwob.json +2 -2
  27. results/GenericAgent-GPT-o1-mini/webarena.json +2 -2
  28. results/GenericAgent-GPT-o1-mini/weblinx.json +2 -2
  29. results/GenericAgent-GPT-o1-mini/workarena-l1.json +2 -2
  30. results/GenericAgent-GPT-o1-mini/workarena-l2.json +2 -2
  31. results/GenericAgent-GPT-o1-mini/workarena-l3.json +2 -2
  32. results/GenericAgent-Llama-3.1-405b/assistantbench.json +3 -3
  33. results/GenericAgent-Llama-3.1-405b/miniwob.json +2 -2
  34. results/GenericAgent-Llama-3.1-405b/webarena.json +2 -2
  35. results/GenericAgent-Llama-3.1-405b/weblinx.json +2 -2
  36. results/GenericAgent-Llama-3.1-405b/workarena-l1.json +2 -2
  37. results/GenericAgent-Llama-3.1-405b/workarena-l2.json +2 -2
  38. results/GenericAgent-Llama-3.1-405b/workarena-l3.json +2 -2
  39. results/GenericAgent-Llama-3.1-70b/assistantbench.json +3 -3
  40. results/GenericAgent-Llama-3.1-70b/miniwob.json +2 -2
  41. results/GenericAgent-Llama-3.1-70b/webarena.json +2 -2
  42. results/GenericAgent-Llama-3.1-70b/weblinx.json +2 -2
  43. results/GenericAgent-Llama-3.1-70b/workarena-l1.json +2 -2
  44. results/GenericAgent-Llama-3.1-70b/workarena-l2.json +2 -2
  45. results/GenericAgent-Llama-3.1-70b/workarena-l3.json +2 -2
results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "study_id",
5
  "benchmark": "AssistantBench",
6
  "score": 5.2,
7
  "std_err": 1.5,
@@ -9,8 +9,8 @@
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
11
  "reproducible": "Yes",
12
- "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
+ "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
5
  "benchmark": "AssistantBench",
6
  "score": 5.2,
7
  "std_err": 1.5,
 
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
11
  "reproducible": "Yes",
12
+ "comments": "Intersection of finished tasks across agents.",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-11-28 19:34:58"
15
  }
16
  ]
results/GenericAgent-Claude-3.5-Sonnet/miniwob.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "study_id",
5
  "benchmark": "MiniWoB",
6
  "score": 69.8,
7
  "std_err": 1.8,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
+ "study_id": "2024-10-25_06-08-16",
5
  "benchmark": "MiniWoB",
6
  "score": 69.8,
7
  "std_err": 1.8,
results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "study_id",
5
  "benchmark": "VisualWebArena",
6
  "score": 21.0,
7
  "std_err": 1.3,
@@ -11,6 +11,6 @@
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
+ "study_id": "22f0611d-aeea-4ee9-a533-b45442b5e080",
5
  "benchmark": "VisualWebArena",
6
  "score": 21.0,
7
  "std_err": 1.3,
 
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-12-02 09:11:35"
15
  }
16
  ]
results/GenericAgent-Claude-3.5-Sonnet/webarena.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "study_id",
5
  "benchmark": "WebArena",
6
  "score": 36.2,
7
  "std_err": 1.7,
@@ -11,6 +11,6 @@
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
+ "study_id": "b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae",
5
  "benchmark": "WebArena",
6
  "score": 36.2,
7
  "std_err": 1.7,
 
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-11-29 22:37:46"
15
  }
16
  ]
results/GenericAgent-Claude-3.5-Sonnet/weblinx.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "study_id",
5
  "benchmark": "WebLINX",
6
  "score": 13.7,
7
  "std_err": 0.6,
@@ -11,6 +11,6 @@
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
+ "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
5
  "benchmark": "WebLINX",
6
  "score": 13.7,
7
  "std_err": 0.6,
 
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-11-07 21:42:30"
15
  }
16
  ]
results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "study_id",
5
  "benchmark": "WorkArena-L1",
6
  "score": 56.4,
7
  "std_err": 2.7,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
+ "study_id": "2024-10-23_14-17-40",
5
  "benchmark": "WorkArena-L1",
6
  "score": 56.4,
7
  "std_err": 2.7,
results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
  "score": 39.1,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
+ "study_id": "2024-10-23_17-10-46",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
  "score": 39.1,
results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
7
  "score": 0.4,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
+ "study_id": "2024-10-24_18-06-57",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
7
  "score": 0.4,
results/GenericAgent-GPT-4o-mini/assistantbench.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "AssistantBench",
7
  "score": 2.1,
8
  "std_err": 1.0,
@@ -10,7 +10,7 @@
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
12
  "reproducible": "Yes",
13
- "comments": "NA",
14
  "original_or_reproduced": "Original"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
+ "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
5
+ "date_time": "2024-11-28 19:34:58",
6
  "benchmark": "AssistantBench",
7
  "score": 2.1,
8
  "std_err": 1.0,
 
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
12
  "reproducible": "Yes",
13
+ "comments": "Intersection of finished tasks across agents.",
14
  "original_or_reproduced": "Original"
15
  }
16
  ]
results/GenericAgent-GPT-4o-mini/miniwob.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
  "score": 56.6,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
+ "study_id": "2024-10-25_06-08-16",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
  "score": 56.6,
results/GenericAgent-GPT-4o-mini/visualwebarena.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "VisualWebArena",
7
  "score": 16.9,
8
  "std_err": 1.2,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
+ "study_id": "8d8642d3-757a-4346-ba45-01398f85b1f4",
5
+ "date_time": "2024-12-02 02:54:33",
6
  "benchmark": "VisualWebArena",
7
  "score": 16.9,
8
  "std_err": 1.2,
results/GenericAgent-GPT-4o-mini/webarena.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebArena",
7
  "score": 17.4,
8
  "std_err": 1.3,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
+ "study_id": "c6bdeb87-9879-4c06-aa70-00d895001156",
5
+ "date_time": "2024-11-29 19:25:49",
6
  "benchmark": "WebArena",
7
  "score": 17.4,
8
  "std_err": 1.3,
results/GenericAgent-GPT-4o-mini/weblinx.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebLINX",
7
  "score": 11.6,
8
  "std_err": 0.6,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
+ "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
5
+ "date_time": "2024-11-07 21:42:30",
6
  "benchmark": "WebLINX",
7
  "score": 11.6,
8
  "std_err": 0.6,
results/GenericAgent-GPT-4o-mini/workarena-l1.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
7
  "score": 27,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
+ "study_id": "2024-10-23_14-17-40",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
7
  "score": 27,
results/GenericAgent-GPT-4o-mini/workarena-l2.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
  "score": 1.3,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
+ "study_id": "2024-10-23_17-10-46",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
  "score": 1.3,
results/GenericAgent-GPT-4o-mini/workarena-l3.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
7
  "score": 0.0,
8
  "std_err": 0.0,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o-mini",
4
+ "study_id": "-",
5
+ "date_time": "2024-10-24 23:03:30",
6
  "benchmark": "WorkArena-L3",
7
  "score": 0.0,
8
  "std_err": 0.0,
results/GenericAgent-GPT-4o/assistantbench.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "AssistantBench",
7
  "score": 4.8,
8
  "std_err": 2.4,
@@ -10,7 +10,7 @@
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
12
  "reproducible": "Yes",
13
- "comments": "NA",
14
  "original_or_reproduced": "Original"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
+ "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
5
+ "date_time": "2024-11-28 19:34:58",
6
  "benchmark": "AssistantBench",
7
  "score": 4.8,
8
  "std_err": 2.4,
 
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
12
  "reproducible": "Yes",
13
+ "comments": "Intersection of finished tasks across agents.",
14
  "original_or_reproduced": "Original"
15
  }
16
  ]
results/GenericAgent-GPT-4o/miniwob.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
- "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
  "score": 63.8,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
+ "study_id": "2024-10-25_06-08-16",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
  "score": 63.8,
results/GenericAgent-GPT-4o/visualwebarena.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "VisualWebArena",
7
  "score": 26.7,
8
  "std_err": 1.5,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
+ "study_id": "7fb7eac8-4bbd-4ebe-be32-15901a7678f2",
5
+ "date_time": "2024-12-02 07:17:28",
6
  "benchmark": "VisualWebArena",
7
  "score": 26.7,
8
  "std_err": 1.5,
results/GenericAgent-GPT-4o/webarena.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebArena",
7
  "score": 31.4,
8
  "std_err": 1.6,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
+ "study_id": "d2eed215-91bb-4603-b69c-8ef8f9d57f34",
5
+ "date_time": "2024-11-29 22:28:32",
6
  "benchmark": "WebArena",
7
  "score": 31.4,
8
  "std_err": 1.6,
results/GenericAgent-GPT-4o/weblinx.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebLINX",
7
  "score": 12.5,
8
  "std_err": 0.6,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
+ "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
5
+ "date_time": "2024-11-07 21:42:30",
6
  "benchmark": "WebLINX",
7
  "score": 12.5,
8
  "std_err": 0.6,
results/GenericAgent-GPT-4o/workarena-l1.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
- "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
7
  "score": 45.5,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
+ "study_id": "2024-10-23_14-17-40",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
7
  "score": 45.5,
results/GenericAgent-GPT-4o/workarena-l2.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
- "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
  "score": 8.5,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
+ "study_id": "2024-10-23_17-10-46",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
  "score": 8.5,
results/GenericAgent-GPT-4o/workarena-l3.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
7
  "score": 0.0,
8
  "std_err": 0.0,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-4o",
4
+ "study_id": "-",
5
+ "date_time": "2024-10-24 23:03:30",
6
  "benchmark": "WorkArena-L3",
7
  "score": 0.0,
8
  "std_err": 0.0,
results/GenericAgent-GPT-o1-mini/assistantbench.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-o1-mini",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "AssistantBench",
7
  "score": 6.9,
8
  "std_err": 2.2,
@@ -10,7 +10,7 @@
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
12
  "reproducible": "Yes",
13
- "comments": "NA",
14
  "original_or_reproduced": "Original"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-o1-mini",
4
+ "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
5
+ "date_time": "2024-11-28 19:34:58",
6
  "benchmark": "AssistantBench",
7
  "score": 6.9,
8
  "std_err": 2.2,
 
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
12
  "reproducible": "Yes",
13
+ "comments": "Intersection of finished tasks across agents.",
14
  "original_or_reproduced": "Original"
15
  }
16
  ]
results/GenericAgent-GPT-o1-mini/miniwob.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-o1-mini",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
  "score": 67.8,
8
  "std_err": 1.9,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-o1-mini",
4
+ "study_id": "2024-10-25_06-08-16",
5
+ "date_time": "2024-10-25 17:16:23",
6
  "benchmark": "MiniWoB",
7
  "score": 67.8,
8
  "std_err": 1.9,
results/GenericAgent-GPT-o1-mini/webarena.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-o1-mini",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebArena",
7
  "score": 28.6,
8
  "std_err": 1.6,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-o1-mini",
4
+ "study_id": "1827983d-5e84-4b63-ad49-bf45ec2a6348",
5
+ "date_time": "2024-11-30 00:22:44",
6
  "benchmark": "WebArena",
7
  "score": 28.6,
8
  "std_err": 1.6,
results/GenericAgent-GPT-o1-mini/weblinx.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-o1-mini",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebLINX",
7
  "score": 12.5,
8
  "std_err": 0.6,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-o1-mini",
4
+ "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
5
+ "date_time": "2024-11-07 21:42:30",
6
  "benchmark": "WebLINX",
7
  "score": 12.5,
8
  "std_err": 0.6,
results/GenericAgent-GPT-o1-mini/workarena-l1.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-o1-mini",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
7
  "score": 56.7,
8
  "std_err": 2.7,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-o1-mini",
4
+ "study_id": "2024-10-23_14-17-40",
5
+ "date_time": "2024-10-23 22:30:06",
6
  "benchmark": "WorkArena-L1",
7
  "score": 56.7,
8
  "std_err": 2.7,
results/GenericAgent-GPT-o1-mini/workarena-l2.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-o1-mini",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
  "score": 14.9,
8
  "std_err": 2.3,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-o1-mini",
4
+ "study_id": "2024-10-23_17-10-46",
5
+ "date_time": "2024-10-24 17:08:53",
6
  "benchmark": "WorkArena-L2",
7
  "score": 14.9,
8
  "std_err": 2.3,
results/GenericAgent-GPT-o1-mini/workarena-l3.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-o1-mini",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
7
  "score": 0.0,
8
  "std_err": 0.0,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-GPT-o1-mini",
4
+ "study_id": "-",
5
+ "date_time": "2024-10-24 23:03:30",
6
  "benchmark": "WorkArena-L3",
7
  "score": 0.0,
8
  "std_err": 0.0,
results/GenericAgent-Llama-3.1-405b/assistantbench.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-405b",
4
- "study_id": "study_id",
5
  "benchmark": "AssistantBench",
6
  "score": 3.9,
7
  "std_err": 1.0,
@@ -9,8 +9,8 @@
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
11
  "reproducible": "Yes",
12
- "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-405b",
4
+ "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
5
  "benchmark": "AssistantBench",
6
  "score": 3.9,
7
  "std_err": 1.0,
 
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
11
  "reproducible": "Yes",
12
+ "comments": "Intersection of finished tasks across agents.",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-11-28 19:34:58"
15
  }
16
  ]
results/GenericAgent-Llama-3.1-405b/miniwob.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-405b",
4
- "study_id": "study_id",
5
  "benchmark": "MiniWoB",
6
  "score": 64.6,
7
  "std_err": 1.9,
@@ -11,6 +11,6 @@
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-405b",
4
+ "study_id": "4d748972-6d35-4489-a197-138b656a7db3",
5
  "benchmark": "MiniWoB",
6
  "score": 64.6,
7
  "std_err": 1.9,
 
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-11-29 16:14:00"
15
  }
16
  ]
results/GenericAgent-Llama-3.1-405b/webarena.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-405b",
4
- "study_id": "study_id",
5
  "benchmark": "WebArena",
6
  "score": 24.0,
7
  "std_err": 1.5,
@@ -11,6 +11,6 @@
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-405b",
4
+ "study_id": "aaeca13d-0cf5-444f-8445-590350b54746",
5
  "benchmark": "WebArena",
6
  "score": 24.0,
7
  "std_err": 1.5,
 
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-12-01 00:04:43"
15
  }
16
  ]
results/GenericAgent-Llama-3.1-405b/weblinx.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-405b",
4
- "study_id": "study_id",
5
  "benchmark": "WebLINX",
6
  "score": 7.9,
7
  "std_err": 0.5,
@@ -11,6 +11,6 @@
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-405b",
4
+ "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
5
  "benchmark": "WebLINX",
6
  "score": 7.9,
7
  "std_err": 0.5,
 
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-11-07 21:42:30"
15
  }
16
  ]
results/GenericAgent-Llama-3.1-405b/workarena-l1.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-405b",
4
- "study_id": "study_id",
5
  "benchmark": "WorkArena-L1",
6
  "score": 43.3,
7
  "std_err": 2.7,
@@ -11,6 +11,6 @@
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-405b",
4
+ "study_id": "2024-10-25_17-34-45",
5
  "benchmark": "WorkArena-L1",
6
  "score": 43.3,
7
  "std_err": 2.7,
 
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-10-25 20:32:26"
15
  }
16
  ]
results/GenericAgent-Llama-3.1-405b/workarena-l2.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-405b",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
  "score": 7.2,
8
  "std_err": 1.7,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-405b",
4
+ "study_id": "528da1f2-1949-41dc-b988-85f19f435af2",
5
+ "date_time": "2024-11-29 14:28:47",
6
  "benchmark": "WorkArena-L2",
7
  "score": 7.2,
8
  "std_err": 1.7,
results/GenericAgent-Llama-3.1-405b/workarena-l3.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-405b",
4
- "study_id": "study_id",
5
  "benchmark": "WorkArena-L3",
6
  "score": 0.0,
7
  "std_err": 0.0,
@@ -11,6 +11,6 @@
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-405b",
4
+ "study_id": "-",
5
  "benchmark": "WorkArena-L3",
6
  "score": 0.0,
7
  "std_err": 0.0,
 
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-10-24 23:03:30"
15
  }
16
  ]
results/GenericAgent-Llama-3.1-70b/assistantbench.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-70b",
4
- "study_id": "study_id",
5
  "benchmark": "AssistantBench",
6
  "score": 2.8,
7
  "std_err": 1.1,
@@ -9,8 +9,8 @@
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
11
  "reproducible": "Yes",
12
- "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-70b",
4
+ "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
5
  "benchmark": "AssistantBench",
6
  "score": 2.8,
7
  "std_err": 1.1,
 
9
  "benchmark_tuned": "No",
10
  "followed_evaluation_protocol": "Yes",
11
  "reproducible": "Yes",
12
+ "comments": "Intersection of finished tasks across agents.",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-11-28 19:34:58"
15
  }
16
  ]
results/GenericAgent-Llama-3.1-70b/miniwob.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-70b",
4
- "study_id": "study_id",
5
  "benchmark": "MiniWoB",
6
  "score": 57.6,
7
  "std_err": 2.0,
@@ -11,6 +11,6 @@
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-70b",
4
+ "study_id": "2024-10-25_06-08-16",
5
  "benchmark": "MiniWoB",
6
  "score": 57.6,
7
  "std_err": 2.0,
 
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-10-25 17:16:23"
15
  }
16
  ]
results/GenericAgent-Llama-3.1-70b/webarena.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-70b",
4
- "study_id": "study_id",
5
  "benchmark": "WebArena",
6
  "score": 18.4,
7
  "std_err": 1.4,
@@ -11,6 +11,6 @@
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-70b",
4
+ "study_id": "fc5747bc-d998-4942-a0eb-e55a3ccc1cb3",
5
  "benchmark": "WebArena",
6
  "score": 18.4,
7
  "std_err": 1.4,
 
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-12-02 23:18:38"
15
  }
16
  ]
results/GenericAgent-Llama-3.1-70b/weblinx.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-70b",
4
- "study_id": "study_id",
5
  "benchmark": "WebLINX",
6
  "score": 8.9,
7
  "std_err": 0.5,
@@ -11,6 +11,6 @@
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-70b",
4
+ "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
5
  "benchmark": "WebLINX",
6
  "score": 8.9,
7
  "std_err": 0.5,
 
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-11-07 21:42:30"
15
  }
16
  ]
results/GenericAgent-Llama-3.1-70b/workarena-l1.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-70b",
4
- "study_id": "study_id",
5
  "benchmark": "WorkArena-L1",
6
  "score": 27.9,
7
  "std_err": 2.5,
@@ -11,6 +11,6 @@
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-70b",
4
+ "study_id": "2024-10-23_14-17-40",
5
  "benchmark": "WorkArena-L1",
6
  "score": 27.9,
7
  "std_err": 2.5,
 
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-10-23 22:30:06"
15
  }
16
  ]
results/GenericAgent-Llama-3.1-70b/workarena-l2.json CHANGED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-70b",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
  "score": 2.1,
8
  "std_err": 0.9,
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-70b",
4
+ "study_id": "2024-10-23_17-10-46",
5
+ "date_time": "2024-10-24 17:08:53",
6
  "benchmark": "WorkArena-L2",
7
  "score": 2.1,
8
  "std_err": 0.9,
results/GenericAgent-Llama-3.1-70b/workarena-l3.json CHANGED
@@ -1,7 +1,7 @@
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-70b",
4
- "study_id": "study_id",
5
  "benchmark": "WorkArena-L3",
6
  "score": 0.0,
7
  "std_err": 0.0,
@@ -11,6 +11,6 @@
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
  }
16
  ]
 
1
  [
2
  {
3
  "agent_name": "GenericAgent-Llama-3.1-70b",
4
+ "study_id": "-",
5
  "benchmark": "WorkArena-L3",
6
  "score": 0.0,
7
  "std_err": 0.0,
 
11
  "reproducible": "Yes",
12
  "comments": "NA",
13
  "original_or_reproduced": "Original",
14
+ "date_time": "2024-10-24 23:03:30"
15
  }
16
  ]