diff --git a/app.py b/app.py index 5b0779ff902a83f865b7e2b51cb5d83b571430c7..f7285bbdee85a3d02161ddea0d34bc72024c4ac8 100644 --- a/app.py +++ b/app.py @@ -155,8 +155,9 @@ def create_html_table_benchmark(df, benchmark): html += '
{sanitize_column_name(column)} | ' + if column == "Reproduced_all" or column == "std_err": + continue + html += f'{sanitize_column_name(column)} | ' html += '{summary}{details} | '
- elif column == "Reproduced_all":
+ elif column == "Reproduced_all" or column == "std_err":
continue
elif column == "Score":
score_with_std_err = f'{row[column]} ± {row["std_err"]}'
diff --git a/results/Bgym-GPT-3.5/README.md b/results/Bgym-GPT-3.5/README.md
deleted file mode 100644
index f15589f8889e87445e98f51746ac426bcb81b9c0..0000000000000000000000000000000000000000
--- a/results/Bgym-GPT-3.5/README.md
+++ /dev/null
@@ -1 +0,0 @@
-## GPT-3.5 model
\ No newline at end of file
diff --git a/results/Bgym-GPT-3.5/config.json b/results/Bgym-GPT-3.5/config.json
deleted file mode 100644
index 3ea3825d4f9cc8568d9cdfb93eac5fcb48a07f86..0000000000000000000000000000000000000000
--- a/results/Bgym-GPT-3.5/config.json
+++ /dev/null
@@ -1,4 +0,0 @@
-{
- "agent_name": "GPT-3.5",
- "backend_llm": "GPT-3.5"
-}
\ No newline at end of file
diff --git a/results/Bgym-GPT-3.5/miniwob.json b/results/Bgym-GPT-3.5/miniwob.json
deleted file mode 100644
index b4d117fa36474116ffc4e8392cd375ccfeb52e6d..0000000000000000000000000000000000000000
--- a/results/Bgym-GPT-3.5/miniwob.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-GPT-3.5",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "MiniWoB",
- "score": 43.4,
- "std_err": 0.1,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-GPT-3.5/webarena.json b/results/Bgym-GPT-3.5/webarena.json
deleted file mode 100644
index 7b352122f2d3056156d07f11f086e85753b090e4..0000000000000000000000000000000000000000
--- a/results/Bgym-GPT-3.5/webarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-GPT-3.5",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "WebArena",
- "score": 6.7,
- "std_err": 0.2,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-GPT-3.5/workarena-l1.json b/results/Bgym-GPT-3.5/workarena-l1.json
deleted file mode 100644
index f163390b64e07051d8eddcfb1ddc4531a8f6af66..0000000000000000000000000000000000000000
--- a/results/Bgym-GPT-3.5/workarena-l1.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-GPT-3.5",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "WorkArena-L1",
- "score": 6.1,
- "std_err": 0.3,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-GPT-3.5/workarena-l2.json b/results/Bgym-GPT-3.5/workarena-l2.json
deleted file mode 100644
index ad6ab82a380e20dd6ffad919d2d9860703bce2ee..0000000000000000000000000000000000000000
--- a/results/Bgym-GPT-3.5/workarena-l2.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-GPT-3.5",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "WorkArena-L2",
- "score": 0.0,
- "std_err": 0.0,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-GPT-4o-V/README.md b/results/Bgym-GPT-4o-V/README.md
deleted file mode 100644
index 065c2f2bbfe5c0845debe1baa0a086f2dd2c019a..0000000000000000000000000000000000000000
--- a/results/Bgym-GPT-4o-V/README.md
+++ /dev/null
@@ -1 +0,0 @@
-## GPT-4o-V model
\ No newline at end of file
diff --git a/results/Bgym-GPT-4o-V/miniwob.json b/results/Bgym-GPT-4o-V/miniwob.json
deleted file mode 100644
index 1090c29ca8017fddd7eb43d7de424c4ef5115f7c..0000000000000000000000000000000000000000
--- a/results/Bgym-GPT-4o-V/miniwob.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-GPT-4o-V",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "MiniWoB",
- "score": 72.5,
- "std_err": 0.5,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-GPT-4o-V/webarena.json b/results/Bgym-GPT-4o-V/webarena.json
deleted file mode 100644
index 4908982e7d053542eeaaf8f2410aa794dc05d52b..0000000000000000000000000000000000000000
--- a/results/Bgym-GPT-4o-V/webarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-GPT-4o-V",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "WebArena",
- "score": 24.0,
- "std_err": 0.4,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-GPT-4o-V/workarena-l1.json b/results/Bgym-GPT-4o-V/workarena-l1.json
deleted file mode 100644
index ed6776d3ca134d76b4e69886ba372792b5ea212a..0000000000000000000000000000000000000000
--- a/results/Bgym-GPT-4o-V/workarena-l1.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-GPT-4o-V",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "WorkArena-L1",
- "score": 41.8,
- "std_err": 0.4,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-GPT-4o-V/workarena-l2.json b/results/Bgym-GPT-4o-V/workarena-l2.json
deleted file mode 100644
index 25e2c312fd03d6b61211943add71430d7bbf1003..0000000000000000000000000000000000000000
--- a/results/Bgym-GPT-4o-V/workarena-l2.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-GPT-4o-V",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "WorkArena-L2",
- "score": 3.8,
- "std_err": 0.6,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-GPT-4o-V/workarena-l3.json b/results/Bgym-GPT-4o-V/workarena-l3.json
deleted file mode 100644
index e9b990349435d7c131cee59d0dc559d4dafbd377..0000000000000000000000000000000000000000
--- a/results/Bgym-GPT-4o-V/workarena-l3.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-GPT-4o-V",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "WorkArena-L3",
- "score": 0.0,
- "std_err": 0.0,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-GPT-o1-mini/workarena-l3.json b/results/Bgym-GPT-o1-mini/workarena-l3.json
deleted file mode 100644
index 27de64528f60146373d975b03208772685c135c8..0000000000000000000000000000000000000000
--- a/results/Bgym-GPT-o1-mini/workarena-l3.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-GPT-o1-mini",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "WorkArena-L3",
- "score": 0.0,
- "std_err": 0.0,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-Llama-3-70b/README.md b/results/Bgym-Llama-3-70b/README.md
deleted file mode 100644
index 8798ff4c72825c5049ef24cf16b818faa9ae5d2b..0000000000000000000000000000000000000000
--- a/results/Bgym-Llama-3-70b/README.md
+++ /dev/null
@@ -1 +0,0 @@
-### Llama-3-70B
\ No newline at end of file
diff --git a/results/Bgym-Llama-3-70b/miniwob.json b/results/Bgym-Llama-3-70b/miniwob.json
deleted file mode 100644
index 5dadad99d077784d8a2c662c6262722c34e834f8..0000000000000000000000000000000000000000
--- a/results/Bgym-Llama-3-70b/miniwob.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-Llama-3-70b",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "MiniWoB",
- "score": 68.2,
- "std_err": 0.7,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-Llama-3-70b/webarena.json b/results/Bgym-Llama-3-70b/webarena.json
deleted file mode 100644
index 6c229ed5d2d97ac3623759b8f7f7131a3830abea..0000000000000000000000000000000000000000
--- a/results/Bgym-Llama-3-70b/webarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-Llama-3-70b",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "WebArena",
- "score": 11.0,
- "std_err": 0.3,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-Llama-3-70b/workarena-l1.json b/results/Bgym-Llama-3-70b/workarena-l1.json
deleted file mode 100644
index 4ffaba752e5d843fe7d02089c499881dd3e321e6..0000000000000000000000000000000000000000
--- a/results/Bgym-Llama-3-70b/workarena-l1.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-Llama-3-70b",
- "study_id": "study_id",
- "benchmark": "WorkArena-L1",
- "score": 17.9,
- "std_err": 0.6,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original",
- "date_time": "2021-01-01 12:00:00"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-Llama-3-70b/workarena-l2.json b/results/Bgym-Llama-3-70b/workarena-l2.json
deleted file mode 100644
index 0f0f8451f47f7022fe3b85923f9356c2f1617c15..0000000000000000000000000000000000000000
--- a/results/Bgym-Llama-3-70b/workarena-l2.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-Llama-3-70b",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "WorkArena-L2",
- "score": 0.0,
- "std_err": 0.0,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-Llama-3-70b/workarena-l3.json b/results/Bgym-Llama-3-70b/workarena-l3.json
deleted file mode 100644
index acf0a81f58b3ab0e03178ad4f7ae27cb9fec4e97..0000000000000000000000000000000000000000
--- a/results/Bgym-Llama-3-70b/workarena-l3.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-Llama-3-70b",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "WorkArena-L3",
- "score": 0.0,
- "std_err": 0.0,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-Mixtral-8x22b/README.md b/results/Bgym-Mixtral-8x22b/README.md
deleted file mode 100644
index 25b17de698790810b5a434228a08691aa048e4ca..0000000000000000000000000000000000000000
--- a/results/Bgym-Mixtral-8x22b/README.md
+++ /dev/null
@@ -1 +0,0 @@
-## Mixtral 8x22B
\ No newline at end of file
diff --git a/results/Bgym-Mixtral-8x22b/miniwob.json b/results/Bgym-Mixtral-8x22b/miniwob.json
deleted file mode 100644
index 0b6ea125b66d8032f19d3922284cadcfe7e5b957..0000000000000000000000000000000000000000
--- a/results/Bgym-Mixtral-8x22b/miniwob.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-Mixtral-8x22b",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "MiniWoB",
- "score": 62.4,
- "std_err": 0.5,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-Mixtral-8x22b/webarena.json b/results/Bgym-Mixtral-8x22b/webarena.json
deleted file mode 100644
index 823344e109e805942efd8e34caa90d2e4a0c4d33..0000000000000000000000000000000000000000
--- a/results/Bgym-Mixtral-8x22b/webarena.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-Mixtral-8x22b",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "WebArena",
- "score": 12.6,
- "std_err": 0.9,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-Mixtral-8x22b/workarena-l1.json b/results/Bgym-Mixtral-8x22b/workarena-l1.json
deleted file mode 100644
index 80661f71df672452d7ae4b1cf8af6edfcdd75f94..0000000000000000000000000000000000000000
--- a/results/Bgym-Mixtral-8x22b/workarena-l1.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-Mixtral-8x22b",
- "study_id": "study_id",
- "benchmark": "WorkArena-L1",
- "score": 12.4,
- "std_err": 0.7,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original",
- "date_time": "2021-01-04 12:06:00"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-Mixtral-8x22b/workarena-l2.json b/results/Bgym-Mixtral-8x22b/workarena-l2.json
deleted file mode 100644
index fbc2324755d166804d3586bf0f54766c3c940232..0000000000000000000000000000000000000000
--- a/results/Bgym-Mixtral-8x22b/workarena-l2.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-Mixtral-8x22b",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "WorkArena-L2",
- "score": 0.0,
- "std_err": 0.0,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-Mixtral-8x22b/workarena-l3.json b/results/Bgym-Mixtral-8x22b/workarena-l3.json
deleted file mode 100644
index 2cfe04fa51a97f69d7172f239484939e0716102a..0000000000000000000000000000000000000000
--- a/results/Bgym-Mixtral-8x22b/workarena-l3.json
+++ /dev/null
@@ -1,16 +0,0 @@
-[
- {
- "agent_name": "Bgym-Mixtral-8x22b",
- "study_id": "study_id",
- "date_time": "2021-01-01 12:00:00",
- "benchmark": "WorkArena-L3",
- "score": 0.0,
- "std_err": 0.0,
- "benchmark_specific": "No",
- "benchmark_tuned": "No",
- "followed_evaluation_protocol": "Yes",
- "reproducible": "Yes",
- "comments": "NA",
- "original_or_reproduced": "Original"
- }
-]
\ No newline at end of file
diff --git a/results/Bgym-Claude-3.5-Sonnet/README.md b/results/GenericAgent-Claude-3.5-Sonnet/README.md
similarity index 100%
rename from results/Bgym-Claude-3.5-Sonnet/README.md
rename to results/GenericAgent-Claude-3.5-Sonnet/README.md
diff --git a/results/Bgym-Claude-3.5-Sonnet/assistantbench.json b/results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json
similarity index 87%
rename from results/Bgym-Claude-3.5-Sonnet/assistantbench.json
rename to results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json
index c4dc58a834bd8d454e15cf824fe700da52d1a774..2817e97e8381f7b5e7f0feea80ced0481f14740b 100644
--- a/results/Bgym-Claude-3.5-Sonnet/assistantbench.json
+++ b/results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Claude-3.5-Sonnet",
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
"study_id": "study_id",
"benchmark": "AssistantBench",
"score": 5.2,
diff --git a/results/Bgym-Claude-3.5-Sonnet/miniwob.json b/results/GenericAgent-Claude-3.5-Sonnet/miniwob.json
similarity index 87%
rename from results/Bgym-Claude-3.5-Sonnet/miniwob.json
rename to results/GenericAgent-Claude-3.5-Sonnet/miniwob.json
index 7ac843b450c25e9079820e355024542a714f8672..d123c6ace4239ce3ad050deec84cbf066df393e5 100644
--- a/results/Bgym-Claude-3.5-Sonnet/miniwob.json
+++ b/results/GenericAgent-Claude-3.5-Sonnet/miniwob.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Claude-3.5-Sonnet",
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
"study_id": "study_id",
"benchmark": "MiniWoB",
"score": 69.8,
diff --git a/results/Bgym-Claude-3.5-Sonnet/webarena.json b/results/GenericAgent-Claude-3.5-Sonnet/webarena.json
similarity index 87%
rename from results/Bgym-Claude-3.5-Sonnet/webarena.json
rename to results/GenericAgent-Claude-3.5-Sonnet/webarena.json
index d00d09d6f46bf170b56d4ac765a0bad6cfc19a79..2986f01595d3e3989e51b25268e1a0e71f78582b 100644
--- a/results/Bgym-Claude-3.5-Sonnet/webarena.json
+++ b/results/GenericAgent-Claude-3.5-Sonnet/webarena.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Claude-3.5-Sonnet",
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
"study_id": "study_id",
"benchmark": "WebArena",
"score": 36.2,
diff --git a/results/Bgym-Claude-3.5-Sonnet/weblinx.json b/results/GenericAgent-Claude-3.5-Sonnet/weblinx.json
similarity index 87%
rename from results/Bgym-Claude-3.5-Sonnet/weblinx.json
rename to results/GenericAgent-Claude-3.5-Sonnet/weblinx.json
index 6763e3b0f5d9d66b698b6bbf2e6921b4572a1eaa..e5aedb085b87a982ad7aef88dc8d640aad784fb8 100644
--- a/results/Bgym-Claude-3.5-Sonnet/weblinx.json
+++ b/results/GenericAgent-Claude-3.5-Sonnet/weblinx.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Claude-3.5-Sonnet",
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
"study_id": "study_id",
"benchmark": "WebLINX",
"score": 13.7,
diff --git a/results/Bgym-Claude-3.5-Sonnet/workarena-l1.json b/results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json
similarity index 87%
rename from results/Bgym-Claude-3.5-Sonnet/workarena-l1.json
rename to results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json
index ed2c2c2097974bbc7c3845fd8915d6588db7698a..7cb49d00ba787ec9d1c4903755b367e241adc4f9 100644
--- a/results/Bgym-Claude-3.5-Sonnet/workarena-l1.json
+++ b/results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Claude-3.5-Sonnet",
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
"study_id": "study_id",
"benchmark": "WorkArena-L1",
"score": 56.4,
diff --git a/results/Bgym-Claude-3.5-Sonnet/workarena-l2.json b/results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json
similarity index 87%
rename from results/Bgym-Claude-3.5-Sonnet/workarena-l2.json
rename to results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json
index c48c1af20788e1ceb58a90ea9e36bda55db6cfca..b66fb16b550a058b359fa90eb44364104437d7b2 100644
--- a/results/Bgym-Claude-3.5-Sonnet/workarena-l2.json
+++ b/results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Claude-3.5-Sonnet",
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WorkArena-L2",
diff --git a/results/Bgym-Claude-3.5-Sonnet/workarena-l3.json b/results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json
similarity index 87%
rename from results/Bgym-Claude-3.5-Sonnet/workarena-l3.json
rename to results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json
index 9f21ceedef7c1a025ae86e197785dea2f83a7e06..522438fcb0792c3ccdb50b8a19576bd70e988345 100644
--- a/results/Bgym-Claude-3.5-Sonnet/workarena-l3.json
+++ b/results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Claude-3.5-Sonnet",
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WorkArena-L3",
diff --git a/results/Bgym-GPT-4o-mini/README.md b/results/GenericAgent-GPT-4o-mini/README.md
similarity index 100%
rename from results/Bgym-GPT-4o-mini/README.md
rename to results/GenericAgent-GPT-4o-mini/README.md
diff --git a/results/Bgym-GPT-4o-mini/assistantbench.json b/results/GenericAgent-GPT-4o-mini/assistantbench.json
similarity index 88%
rename from results/Bgym-GPT-4o-mini/assistantbench.json
rename to results/GenericAgent-GPT-4o-mini/assistantbench.json
index 9428ea816ad49681f3cb4c993d0e68dedd3eea41..e29481fb2c87e364ac7fcd236caddee1b1a1d176 100644
--- a/results/Bgym-GPT-4o-mini/assistantbench.json
+++ b/results/GenericAgent-GPT-4o-mini/assistantbench.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-4o-mini",
+ "agent_name": "GenericAgent-GPT-4o-mini",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "AssistantBench",
diff --git a/results/Bgym-GPT-4o-mini/miniwob.json b/results/GenericAgent-GPT-4o-mini/miniwob.json
similarity index 88%
rename from results/Bgym-GPT-4o-mini/miniwob.json
rename to results/GenericAgent-GPT-4o-mini/miniwob.json
index 75d99b96bcf7d72ea41c49022be9b294b897b79c..24262492859411f33f84701017a01c37bbd0a332 100644
--- a/results/Bgym-GPT-4o-mini/miniwob.json
+++ b/results/GenericAgent-GPT-4o-mini/miniwob.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-4o-mini",
+ "agent_name": "GenericAgent-GPT-4o-mini",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "MiniWoB",
diff --git a/results/Bgym-GPT-4o-mini/webarena.json b/results/GenericAgent-GPT-4o-mini/webarena.json
similarity index 88%
rename from results/Bgym-GPT-4o-mini/webarena.json
rename to results/GenericAgent-GPT-4o-mini/webarena.json
index 1c59caff4a0c1f822f3ef2fb9698eb16331df56b..0bd123ecdbbfbeca4be421548b05328f8893aba6 100644
--- a/results/Bgym-GPT-4o-mini/webarena.json
+++ b/results/GenericAgent-GPT-4o-mini/webarena.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-4o-mini",
+ "agent_name": "GenericAgent-GPT-4o-mini",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WebArena",
diff --git a/results/Bgym-GPT-4o-mini/weblinx.json b/results/GenericAgent-GPT-4o-mini/weblinx.json
similarity index 88%
rename from results/Bgym-GPT-4o-mini/weblinx.json
rename to results/GenericAgent-GPT-4o-mini/weblinx.json
index 356eb5d5c607ca8e12a05ec651d2d47a41144082..ec560a120d2a5e376c98497608705d7c503f4176 100644
--- a/results/Bgym-GPT-4o-mini/weblinx.json
+++ b/results/GenericAgent-GPT-4o-mini/weblinx.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-4o-mini",
+ "agent_name": "GenericAgent-GPT-4o-mini",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WebLINX",
diff --git a/results/Bgym-GPT-4o-mini/workarena-l1.json b/results/GenericAgent-GPT-4o-mini/workarena-l1.json
similarity index 88%
rename from results/Bgym-GPT-4o-mini/workarena-l1.json
rename to results/GenericAgent-GPT-4o-mini/workarena-l1.json
index f4f444b46e6e6ea843e14ac799d1b510c98f66b5..ae2e1c6b0527cdda8fe18ab685c18081f504208c 100644
--- a/results/Bgym-GPT-4o-mini/workarena-l1.json
+++ b/results/GenericAgent-GPT-4o-mini/workarena-l1.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-4o-mini",
+ "agent_name": "GenericAgent-GPT-4o-mini",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WorkArena-L1",
diff --git a/results/Bgym-GPT-4o-mini/workarena-l2.json b/results/GenericAgent-GPT-4o-mini/workarena-l2.json
similarity index 88%
rename from results/Bgym-GPT-4o-mini/workarena-l2.json
rename to results/GenericAgent-GPT-4o-mini/workarena-l2.json
index 1c80ee97e16dd3688262007ce69b3c7dfa98720f..6901a7ef5ec6a12e1c15ad50b98b926ee425a97b 100644
--- a/results/Bgym-GPT-4o-mini/workarena-l2.json
+++ b/results/GenericAgent-GPT-4o-mini/workarena-l2.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-4o-mini",
+ "agent_name": "GenericAgent-GPT-4o-mini",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WorkArena-L2",
diff --git a/results/Bgym-GPT-4o-mini/workarena-l3.json b/results/GenericAgent-GPT-4o-mini/workarena-l3.json
similarity index 88%
rename from results/Bgym-GPT-4o-mini/workarena-l3.json
rename to results/GenericAgent-GPT-4o-mini/workarena-l3.json
index 27ef5630215cb85ece695d757831a94745ca11a0..29a97656776d58983de5dc16bd438b57ae5d9116 100644
--- a/results/Bgym-GPT-4o-mini/workarena-l3.json
+++ b/results/GenericAgent-GPT-4o-mini/workarena-l3.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-4o-mini",
+ "agent_name": "GenericAgent-GPT-4o-mini",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WorkArena-L3",
diff --git a/results/Bgym-GPT-4o/README.md b/results/GenericAgent-GPT-4o/README.md
similarity index 100%
rename from results/Bgym-GPT-4o/README.md
rename to results/GenericAgent-GPT-4o/README.md
diff --git a/results/Bgym-GPT-4o/assistantbench.json b/results/GenericAgent-GPT-4o/assistantbench.json
similarity index 89%
rename from results/Bgym-GPT-4o/assistantbench.json
rename to results/GenericAgent-GPT-4o/assistantbench.json
index 7223e5586bdcb3d5f46987a6ffa5d33abdd17966..04c05cc788cc38f2b67f8dae5ace91d4fac187ee 100644
--- a/results/Bgym-GPT-4o/assistantbench.json
+++ b/results/GenericAgent-GPT-4o/assistantbench.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-4o",
+ "agent_name": "GenericAgent-GPT-4o",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "AssistantBench",
diff --git a/results/Bgym-GPT-4o/miniwob.json b/results/GenericAgent-GPT-4o/miniwob.json
similarity index 89%
rename from results/Bgym-GPT-4o/miniwob.json
rename to results/GenericAgent-GPT-4o/miniwob.json
index a4dc858ac1c479280afd766c79918b3206ccef4c..717377ed01beed367ec089d296547f8d017f9fdb 100644
--- a/results/Bgym-GPT-4o/miniwob.json
+++ b/results/GenericAgent-GPT-4o/miniwob.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-4o",
+ "agent_name": "GenericAgent-GPT-4o",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "MiniWoB",
diff --git a/results/Bgym-GPT-4o/webarena.json b/results/GenericAgent-GPT-4o/webarena.json
similarity index 89%
rename from results/Bgym-GPT-4o/webarena.json
rename to results/GenericAgent-GPT-4o/webarena.json
index 30684c7d9644bca29805a47bf38c43506ea4cf84..947aafbb445d03814411d4fb4daf0ff88cd3e10f 100644
--- a/results/Bgym-GPT-4o/webarena.json
+++ b/results/GenericAgent-GPT-4o/webarena.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-4o",
+ "agent_name": "GenericAgent-GPT-4o",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WebArena",
diff --git a/results/Bgym-GPT-4o/weblinx.json b/results/GenericAgent-GPT-4o/weblinx.json
similarity index 89%
rename from results/Bgym-GPT-4o/weblinx.json
rename to results/GenericAgent-GPT-4o/weblinx.json
index d0eb1f5a22bc287a4132092b9bec567c9ad1e51c..0e6cc2cda023d01b9333bb6e5c5975d8841b87d7 100644
--- a/results/Bgym-GPT-4o/weblinx.json
+++ b/results/GenericAgent-GPT-4o/weblinx.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-4o",
+ "agent_name": "GenericAgent-GPT-4o",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WebLINX",
diff --git a/results/Bgym-GPT-4o/workarena-l1.json b/results/GenericAgent-GPT-4o/workarena-l1.json
similarity index 89%
rename from results/Bgym-GPT-4o/workarena-l1.json
rename to results/GenericAgent-GPT-4o/workarena-l1.json
index 82191533728bfed194fbf7caf77779894000752e..597df5c7a7dc9c3c8ee38aa2ca90f76b27dd5e8c 100644
--- a/results/Bgym-GPT-4o/workarena-l1.json
+++ b/results/GenericAgent-GPT-4o/workarena-l1.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-4o",
+ "agent_name": "GenericAgent-GPT-4o",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WorkArena-L1",
diff --git a/results/Bgym-GPT-4o/workarena-l2.json b/results/GenericAgent-GPT-4o/workarena-l2.json
similarity index 89%
rename from results/Bgym-GPT-4o/workarena-l2.json
rename to results/GenericAgent-GPT-4o/workarena-l2.json
index c6739539b0befba0de7bcf3905c5c9d542b97d3d..4bf9f5cf0928b5cf23380d03de268cea99e3f10e 100644
--- a/results/Bgym-GPT-4o/workarena-l2.json
+++ b/results/GenericAgent-GPT-4o/workarena-l2.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-4o",
+ "agent_name": "GenericAgent-GPT-4o",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WorkArena-L2",
diff --git a/results/Bgym-GPT-4o/workarena-l3.json b/results/GenericAgent-GPT-4o/workarena-l3.json
similarity index 89%
rename from results/Bgym-GPT-4o/workarena-l3.json
rename to results/GenericAgent-GPT-4o/workarena-l3.json
index a5ec6780ea9ca807b4a8ef4d401ca14d33f7e3d6..bc56a8e64b483cc3c5163f57718af7711fc48840 100644
--- a/results/Bgym-GPT-4o/workarena-l3.json
+++ b/results/GenericAgent-GPT-4o/workarena-l3.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-4o",
+ "agent_name": "GenericAgent-GPT-4o",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WorkArena-L3",
diff --git a/results/Bgym-GPT-o1-mini/README.md b/results/GenericAgent-GPT-o1-mini/README.md
similarity index 100%
rename from results/Bgym-GPT-o1-mini/README.md
rename to results/GenericAgent-GPT-o1-mini/README.md
diff --git a/results/Bgym-GPT-o1-mini/assistantbench.json b/results/GenericAgent-GPT-o1-mini/assistantbench.json
similarity index 88%
rename from results/Bgym-GPT-o1-mini/assistantbench.json
rename to results/GenericAgent-GPT-o1-mini/assistantbench.json
index 307925dae4e75587aa979065062c87238dbb8a05..3b3c545c15ddd2f496be318466be01260b44e15d 100644
--- a/results/Bgym-GPT-o1-mini/assistantbench.json
+++ b/results/GenericAgent-GPT-o1-mini/assistantbench.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-o1-mini",
+ "agent_name": "GenericAgent-GPT-o1-mini",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "AssistantBench",
diff --git a/results/Bgym-GPT-o1-mini/miniwob.json b/results/GenericAgent-GPT-o1-mini/miniwob.json
similarity index 88%
rename from results/Bgym-GPT-o1-mini/miniwob.json
rename to results/GenericAgent-GPT-o1-mini/miniwob.json
index 3483d9051f7751e8fc37713744ad21a74a27066c..46e6e4937b729d992687cdaf838ac67cc814df77 100644
--- a/results/Bgym-GPT-o1-mini/miniwob.json
+++ b/results/GenericAgent-GPT-o1-mini/miniwob.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-o1-mini",
+ "agent_name": "GenericAgent-GPT-o1-mini",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "MiniWoB",
diff --git a/results/Bgym-GPT-o1-mini/webarena.json b/results/GenericAgent-GPT-o1-mini/webarena.json
similarity index 88%
rename from results/Bgym-GPT-o1-mini/webarena.json
rename to results/GenericAgent-GPT-o1-mini/webarena.json
index 369b02fec15d5e1f5a980f515693e77fc4ecd272..861c3cffe85efd49e9d62eb4d5093829ef52fe10 100644
--- a/results/Bgym-GPT-o1-mini/webarena.json
+++ b/results/GenericAgent-GPT-o1-mini/webarena.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-o1-mini",
+ "agent_name": "GenericAgent-GPT-o1-mini",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WebArena",
diff --git a/results/Bgym-GPT-o1-mini/weblinx.json b/results/GenericAgent-GPT-o1-mini/weblinx.json
similarity index 88%
rename from results/Bgym-GPT-o1-mini/weblinx.json
rename to results/GenericAgent-GPT-o1-mini/weblinx.json
index 8ceeb6a6aab13421c37130e7c29cd958be2a6f44..cf0a5b827c987bd0870a6405bef252b2e0a7ac69 100644
--- a/results/Bgym-GPT-o1-mini/weblinx.json
+++ b/results/GenericAgent-GPT-o1-mini/weblinx.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-o1-mini",
+ "agent_name": "GenericAgent-GPT-o1-mini",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WebLINX",
diff --git a/results/Bgym-GPT-o1-mini/workarena-l1.json b/results/GenericAgent-GPT-o1-mini/workarena-l1.json
similarity index 88%
rename from results/Bgym-GPT-o1-mini/workarena-l1.json
rename to results/GenericAgent-GPT-o1-mini/workarena-l1.json
index fac67e3ab9222bb24f82f2e60b7eedee0388b520..9bc93070836fa763371e27350d8a14c797ef9f6d 100644
--- a/results/Bgym-GPT-o1-mini/workarena-l1.json
+++ b/results/GenericAgent-GPT-o1-mini/workarena-l1.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-o1-mini",
+ "agent_name": "GenericAgent-GPT-o1-mini",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WorkArena-L1",
diff --git a/results/Bgym-GPT-o1-mini/workarena-l2.json b/results/GenericAgent-GPT-o1-mini/workarena-l2.json
similarity index 88%
rename from results/Bgym-GPT-o1-mini/workarena-l2.json
rename to results/GenericAgent-GPT-o1-mini/workarena-l2.json
index c57f1d30c9455651f302d654ccedfb1ee30eabf4..292db2bf5f826ae09fd1d8043022c20100bc3a62 100644
--- a/results/Bgym-GPT-o1-mini/workarena-l2.json
+++ b/results/GenericAgent-GPT-o1-mini/workarena-l2.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-o1-mini",
+ "agent_name": "GenericAgent-GPT-o1-mini",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WorkArena-L2",
diff --git a/results/Bgym-GPT-3.5/workarena-l3.json b/results/GenericAgent-GPT-o1-mini/workarena-l3.json
similarity index 88%
rename from results/Bgym-GPT-3.5/workarena-l3.json
rename to results/GenericAgent-GPT-o1-mini/workarena-l3.json
index 40093a485842f340d16d25af5768e8d066377a05..97a2b1e29833b771a00942646bc380f9b0b14172 100644
--- a/results/Bgym-GPT-3.5/workarena-l3.json
+++ b/results/GenericAgent-GPT-o1-mini/workarena-l3.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-GPT-3.5",
+ "agent_name": "GenericAgent-GPT-o1-mini",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WorkArena-L3",
diff --git a/results/Bgym-Llama-3.1-405b/README.md b/results/GenericAgent-Llama-3.1-405b/README.md
similarity index 100%
rename from results/Bgym-Llama-3.1-405b/README.md
rename to results/GenericAgent-Llama-3.1-405b/README.md
diff --git a/results/Bgym-Llama-3.1-405b/assistantbench.json b/results/GenericAgent-Llama-3.1-405b/assistantbench.json
similarity index 88%
rename from results/Bgym-Llama-3.1-405b/assistantbench.json
rename to results/GenericAgent-Llama-3.1-405b/assistantbench.json
index 7a6aef8a8b7f7a21b7387ba8b014b7d12e080c1f..6d42af475973c54772ef675d96074b0ec8b16bc5 100644
--- a/results/Bgym-Llama-3.1-405b/assistantbench.json
+++ b/results/GenericAgent-Llama-3.1-405b/assistantbench.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Llama-3.1-405b",
+ "agent_name": "GenericAgent-Llama-3.1-405b",
"study_id": "study_id",
"benchmark": "AssistantBench",
"score": 3.9,
diff --git a/results/Bgym-Llama-3.1-405b/miniwob.json b/results/GenericAgent-Llama-3.1-405b/miniwob.json
similarity index 87%
rename from results/Bgym-Llama-3.1-405b/miniwob.json
rename to results/GenericAgent-Llama-3.1-405b/miniwob.json
index ff6afc44fdd43fe48caa0c6ec642c04a356effe9..9b72c81f11cce770d9f799cedabbaeec110e4921 100644
--- a/results/Bgym-Llama-3.1-405b/miniwob.json
+++ b/results/GenericAgent-Llama-3.1-405b/miniwob.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Llama-3.1-405b",
+ "agent_name": "GenericAgent-Llama-3.1-405b",
"study_id": "study_id",
"benchmark": "MiniWoB",
"score": 64.6,
diff --git a/results/Bgym-Llama-3.1-405b/webarena.json b/results/GenericAgent-Llama-3.1-405b/webarena.json
similarity index 88%
rename from results/Bgym-Llama-3.1-405b/webarena.json
rename to results/GenericAgent-Llama-3.1-405b/webarena.json
index 238b9745569a13ef80839afac0677f9e4c1e6839..995808e98aeb8c021efff48b20c32439101dc644 100644
--- a/results/Bgym-Llama-3.1-405b/webarena.json
+++ b/results/GenericAgent-Llama-3.1-405b/webarena.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Llama-3.1-405b",
+ "agent_name": "GenericAgent-Llama-3.1-405b",
"study_id": "study_id",
"benchmark": "WebArena",
"score": 24.0,
diff --git a/results/Bgym-Llama-3.1-405b/weblinx.json b/results/GenericAgent-Llama-3.1-405b/weblinx.json
similarity index 87%
rename from results/Bgym-Llama-3.1-405b/weblinx.json
rename to results/GenericAgent-Llama-3.1-405b/weblinx.json
index 92d1ccc6ab1b525468bcd69fdafd0c40e6877967..f9ad37e621755c673c405e27b69ba526a85a3f7d 100644
--- a/results/Bgym-Llama-3.1-405b/weblinx.json
+++ b/results/GenericAgent-Llama-3.1-405b/weblinx.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Llama-3.1-405b",
+ "agent_name": "GenericAgent-Llama-3.1-405b",
"study_id": "study_id",
"benchmark": "WebLINX",
"score": 7.9,
diff --git a/results/Bgym-Llama-3.1-405b/workarena-l1.json b/results/GenericAgent-Llama-3.1-405b/workarena-l1.json
similarity index 88%
rename from results/Bgym-Llama-3.1-405b/workarena-l1.json
rename to results/GenericAgent-Llama-3.1-405b/workarena-l1.json
index cd313d345eb0e117b62811b7b6479021b9ea54fc..1697c31799afbd33aa6f82158a6ac70bc7c7a8a1 100644
--- a/results/Bgym-Llama-3.1-405b/workarena-l1.json
+++ b/results/GenericAgent-Llama-3.1-405b/workarena-l1.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Llama-3.1-405b",
+ "agent_name": "GenericAgent-Llama-3.1-405b",
"study_id": "study_id",
"benchmark": "WorkArena-L1",
"score": 43.3,
diff --git a/results/Bgym-Llama-3.1-405b/workarena-l2.json b/results/GenericAgent-Llama-3.1-405b/workarena-l2.json
similarity index 88%
rename from results/Bgym-Llama-3.1-405b/workarena-l2.json
rename to results/GenericAgent-Llama-3.1-405b/workarena-l2.json
index 391f94a227fb9b83edce81bd032751bc4838e5ca..1af5375b44abd648dc2223fcdb9ea079bb87c9d6 100644
--- a/results/Bgym-Llama-3.1-405b/workarena-l2.json
+++ b/results/GenericAgent-Llama-3.1-405b/workarena-l2.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Llama-3.1-405b",
+ "agent_name": "GenericAgent-Llama-3.1-405b",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WorkArena-L2",
diff --git a/results/Bgym-Llama-3.1-70b/workarena-l3.json b/results/GenericAgent-Llama-3.1-405b/workarena-l3.json
similarity index 88%
rename from results/Bgym-Llama-3.1-70b/workarena-l3.json
rename to results/GenericAgent-Llama-3.1-405b/workarena-l3.json
index f5be1f90d3d048cc06696d79ab199297ce07e81c..5e520211e8ca9b333b9fe542dda04d0e6b417946 100644
--- a/results/Bgym-Llama-3.1-70b/workarena-l3.json
+++ b/results/GenericAgent-Llama-3.1-405b/workarena-l3.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Llama-3.1-70b",
+ "agent_name": "GenericAgent-Llama-3.1-405b",
"study_id": "study_id",
"benchmark": "WorkArena-L3",
"score": 0.0,
diff --git a/results/Bgym-Llama-3.1-70b/README.md b/results/GenericAgent-Llama-3.1-70b/README.md
similarity index 100%
rename from results/Bgym-Llama-3.1-70b/README.md
rename to results/GenericAgent-Llama-3.1-70b/README.md
diff --git a/results/Bgym-Llama-3.1-70b/assistantbench.json b/results/GenericAgent-Llama-3.1-70b/assistantbench.json
similarity index 88%
rename from results/Bgym-Llama-3.1-70b/assistantbench.json
rename to results/GenericAgent-Llama-3.1-70b/assistantbench.json
index 9a5bb61f45b6ea6a1f788364e35a4c55fe986373..49bc83231b840b79e57a8f2d896fe1012e9afce6 100644
--- a/results/Bgym-Llama-3.1-70b/assistantbench.json
+++ b/results/GenericAgent-Llama-3.1-70b/assistantbench.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Llama-3.1-70b",
+ "agent_name": "GenericAgent-Llama-3.1-70b",
"study_id": "study_id",
"benchmark": "AssistantBench",
"score": 2.8,
diff --git a/results/Bgym-Llama-3.1-70b/miniwob.json b/results/GenericAgent-Llama-3.1-70b/miniwob.json
similarity index 88%
rename from results/Bgym-Llama-3.1-70b/miniwob.json
rename to results/GenericAgent-Llama-3.1-70b/miniwob.json
index 037c919bac1cea16f4c53064f9816bb408210c40..dbde1c17d121e4a064a410a7a5ecc2921fdbe260 100644
--- a/results/Bgym-Llama-3.1-70b/miniwob.json
+++ b/results/GenericAgent-Llama-3.1-70b/miniwob.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Llama-3.1-70b",
+ "agent_name": "GenericAgent-Llama-3.1-70b",
"study_id": "study_id",
"benchmark": "MiniWoB",
"score": 57.6,
diff --git a/results/Bgym-Llama-3.1-70b/webarena.json b/results/GenericAgent-Llama-3.1-70b/webarena.json
similarity index 88%
rename from results/Bgym-Llama-3.1-70b/webarena.json
rename to results/GenericAgent-Llama-3.1-70b/webarena.json
index 80776cee4f94811d958dfeee356854d01ec66c8e..43beebc9688067b3bdf592d3a3d2e1db5b4fb6c0 100644
--- a/results/Bgym-Llama-3.1-70b/webarena.json
+++ b/results/GenericAgent-Llama-3.1-70b/webarena.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Llama-3.1-70b",
+ "agent_name": "GenericAgent-Llama-3.1-70b",
"study_id": "study_id",
"benchmark": "WebArena",
"score": 18.4,
diff --git a/results/Bgym-Llama-3.1-70b/weblinx.json b/results/GenericAgent-Llama-3.1-70b/weblinx.json
similarity index 88%
rename from results/Bgym-Llama-3.1-70b/weblinx.json
rename to results/GenericAgent-Llama-3.1-70b/weblinx.json
index 2031c283f003538a38aed3bda3beb1af660bcd42..b8b1fe95dbbf0861c22e8fb57c891d364e217470 100644
--- a/results/Bgym-Llama-3.1-70b/weblinx.json
+++ b/results/GenericAgent-Llama-3.1-70b/weblinx.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Llama-3.1-70b",
+ "agent_name": "GenericAgent-Llama-3.1-70b",
"study_id": "study_id",
"benchmark": "WebLINX",
"score": 8.9,
diff --git a/results/Bgym-Llama-3.1-70b/workarena-l1.json b/results/GenericAgent-Llama-3.1-70b/workarena-l1.json
similarity index 88%
rename from results/Bgym-Llama-3.1-70b/workarena-l1.json
rename to results/GenericAgent-Llama-3.1-70b/workarena-l1.json
index a9adb101bdae51e253add0e118656f129fd65b60..a7f0eef5144d214e8d904df1460d07256441d3db 100644
--- a/results/Bgym-Llama-3.1-70b/workarena-l1.json
+++ b/results/GenericAgent-Llama-3.1-70b/workarena-l1.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Llama-3.1-70b",
+ "agent_name": "GenericAgent-Llama-3.1-70b",
"study_id": "study_id",
"benchmark": "WorkArena-L1",
"score": 27.9,
diff --git a/results/Bgym-Llama-3.1-70b/workarena-l2.json b/results/GenericAgent-Llama-3.1-70b/workarena-l2.json
similarity index 88%
rename from results/Bgym-Llama-3.1-70b/workarena-l2.json
rename to results/GenericAgent-Llama-3.1-70b/workarena-l2.json
index 19f54715a6e2fe577f6456baa9cb72eb80cbc814..96f6a47643e8ae54550e8a85f9fd76abce116aaf 100644
--- a/results/Bgym-Llama-3.1-70b/workarena-l2.json
+++ b/results/GenericAgent-Llama-3.1-70b/workarena-l2.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Llama-3.1-70b",
+ "agent_name": "GenericAgent-Llama-3.1-70b",
"study_id": "study_id",
"date_time": "2021-01-01 12:00:00",
"benchmark": "WorkArena-L2",
diff --git a/results/Bgym-Llama-3.1-405b/workarena-l3.json b/results/GenericAgent-Llama-3.1-70b/workarena-l3.json
similarity index 88%
rename from results/Bgym-Llama-3.1-405b/workarena-l3.json
rename to results/GenericAgent-Llama-3.1-70b/workarena-l3.json
index 0b4c21ced555acbd3c31ffa9b72affd578084d4c..3801b2ebd8c41bd70d19394d31833d384321d97b 100644
--- a/results/Bgym-Llama-3.1-405b/workarena-l3.json
+++ b/results/GenericAgent-Llama-3.1-70b/workarena-l3.json
@@ -1,6 +1,6 @@
[
{
- "agent_name": "Bgym-Llama-3.1-405b",
+ "agent_name": "GenericAgent-Llama-3.1-70b",
"study_id": "study_id",
"benchmark": "WorkArena-L3",
"score": 0.0,
---|