diff --git a/app.py b/app.py index 5b0779ff902a83f865b7e2b51cb5d83b571430c7..f7285bbdee85a3d02161ddea0d34bc72024c4ac8 100644 --- a/app.py +++ b/app.py @@ -155,8 +155,9 @@ def create_html_table_benchmark(df, benchmark): html += '' html += '' for column in df.columns: - if column != "Reproduced_all": - html += f'' + if column == "Reproduced_all" or column == "std_err": + continue + html += f'' html += '' html += '' for _, row in df.iterrows(): @@ -169,7 +170,7 @@ def create_html_table_benchmark(df, benchmark): summary = sanitize_cell_value(row[column]) details = "
".join(map(sanitize_cell_value, row["Reproduced_all"])) html += f'' - elif column == "Reproduced_all": + elif column == "Reproduced_all" or column == "std_err": continue elif column == "Score": score_with_std_err = f'{row[column]} ± {row["std_err"]}' diff --git a/results/Bgym-GPT-3.5/README.md b/results/Bgym-GPT-3.5/README.md deleted file mode 100644 index f15589f8889e87445e98f51746ac426bcb81b9c0..0000000000000000000000000000000000000000 --- a/results/Bgym-GPT-3.5/README.md +++ /dev/null @@ -1 +0,0 @@ -## GPT-3.5 model \ No newline at end of file diff --git a/results/Bgym-GPT-3.5/config.json b/results/Bgym-GPT-3.5/config.json deleted file mode 100644 index 3ea3825d4f9cc8568d9cdfb93eac5fcb48a07f86..0000000000000000000000000000000000000000 --- a/results/Bgym-GPT-3.5/config.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "agent_name": "GPT-3.5", - "backend_llm": "GPT-3.5" -} \ No newline at end of file diff --git a/results/Bgym-GPT-3.5/miniwob.json b/results/Bgym-GPT-3.5/miniwob.json deleted file mode 100644 index b4d117fa36474116ffc4e8392cd375ccfeb52e6d..0000000000000000000000000000000000000000 --- a/results/Bgym-GPT-3.5/miniwob.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-GPT-3.5", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "MiniWoB", - "score": 43.4, - "std_err": 0.1, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-GPT-3.5/webarena.json b/results/Bgym-GPT-3.5/webarena.json deleted file mode 100644 index 7b352122f2d3056156d07f11f086e85753b090e4..0000000000000000000000000000000000000000 --- a/results/Bgym-GPT-3.5/webarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-GPT-3.5", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "WebArena", - "score": 6.7, - "std_err": 0.2, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-GPT-3.5/workarena-l1.json b/results/Bgym-GPT-3.5/workarena-l1.json deleted file mode 100644 index f163390b64e07051d8eddcfb1ddc4531a8f6af66..0000000000000000000000000000000000000000 --- a/results/Bgym-GPT-3.5/workarena-l1.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-GPT-3.5", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "WorkArena-L1", - "score": 6.1, - "std_err": 0.3, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-GPT-3.5/workarena-l2.json b/results/Bgym-GPT-3.5/workarena-l2.json deleted file mode 100644 index ad6ab82a380e20dd6ffad919d2d9860703bce2ee..0000000000000000000000000000000000000000 --- a/results/Bgym-GPT-3.5/workarena-l2.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-GPT-3.5", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "WorkArena-L2", - "score": 0.0, - "std_err": 0.0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-GPT-4o-V/README.md b/results/Bgym-GPT-4o-V/README.md deleted file mode 100644 index 065c2f2bbfe5c0845debe1baa0a086f2dd2c019a..0000000000000000000000000000000000000000 --- a/results/Bgym-GPT-4o-V/README.md +++ /dev/null @@ -1 +0,0 @@ -## GPT-4o-V model \ No newline at end of file diff --git a/results/Bgym-GPT-4o-V/miniwob.json b/results/Bgym-GPT-4o-V/miniwob.json deleted file mode 100644 index 1090c29ca8017fddd7eb43d7de424c4ef5115f7c..0000000000000000000000000000000000000000 --- a/results/Bgym-GPT-4o-V/miniwob.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-GPT-4o-V", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "MiniWoB", - "score": 72.5, - "std_err": 0.5, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-GPT-4o-V/webarena.json b/results/Bgym-GPT-4o-V/webarena.json deleted file mode 100644 index 4908982e7d053542eeaaf8f2410aa794dc05d52b..0000000000000000000000000000000000000000 --- a/results/Bgym-GPT-4o-V/webarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-GPT-4o-V", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "WebArena", - "score": 24.0, - "std_err": 0.4, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-GPT-4o-V/workarena-l1.json b/results/Bgym-GPT-4o-V/workarena-l1.json deleted file mode 100644 index ed6776d3ca134d76b4e69886ba372792b5ea212a..0000000000000000000000000000000000000000 --- a/results/Bgym-GPT-4o-V/workarena-l1.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-GPT-4o-V", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "WorkArena-L1", - "score": 41.8, - "std_err": 0.4, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-GPT-4o-V/workarena-l2.json b/results/Bgym-GPT-4o-V/workarena-l2.json deleted file mode 100644 index 25e2c312fd03d6b61211943add71430d7bbf1003..0000000000000000000000000000000000000000 --- a/results/Bgym-GPT-4o-V/workarena-l2.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-GPT-4o-V", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "WorkArena-L2", - "score": 3.8, - "std_err": 0.6, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-GPT-4o-V/workarena-l3.json b/results/Bgym-GPT-4o-V/workarena-l3.json deleted file mode 100644 index e9b990349435d7c131cee59d0dc559d4dafbd377..0000000000000000000000000000000000000000 --- a/results/Bgym-GPT-4o-V/workarena-l3.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-GPT-4o-V", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "WorkArena-L3", - "score": 0.0, - "std_err": 0.0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-GPT-o1-mini/workarena-l3.json b/results/Bgym-GPT-o1-mini/workarena-l3.json deleted file mode 100644 index 27de64528f60146373d975b03208772685c135c8..0000000000000000000000000000000000000000 --- a/results/Bgym-GPT-o1-mini/workarena-l3.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-GPT-o1-mini", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "WorkArena-L3", - "score": 0.0, - "std_err": 0.0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-Llama-3-70b/README.md b/results/Bgym-Llama-3-70b/README.md deleted file mode 100644 index 8798ff4c72825c5049ef24cf16b818faa9ae5d2b..0000000000000000000000000000000000000000 --- a/results/Bgym-Llama-3-70b/README.md +++ /dev/null @@ -1 +0,0 @@ -### Llama-3-70B \ No newline at end of file diff --git a/results/Bgym-Llama-3-70b/miniwob.json b/results/Bgym-Llama-3-70b/miniwob.json deleted file mode 100644 index 5dadad99d077784d8a2c662c6262722c34e834f8..0000000000000000000000000000000000000000 --- a/results/Bgym-Llama-3-70b/miniwob.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-Llama-3-70b", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "MiniWoB", - "score": 68.2, - "std_err": 0.7, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-Llama-3-70b/webarena.json b/results/Bgym-Llama-3-70b/webarena.json deleted file mode 100644 index 6c229ed5d2d97ac3623759b8f7f7131a3830abea..0000000000000000000000000000000000000000 --- a/results/Bgym-Llama-3-70b/webarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-Llama-3-70b", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "WebArena", - "score": 11.0, - "std_err": 0.3, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-Llama-3-70b/workarena-l1.json b/results/Bgym-Llama-3-70b/workarena-l1.json deleted file mode 100644 index 4ffaba752e5d843fe7d02089c499881dd3e321e6..0000000000000000000000000000000000000000 --- a/results/Bgym-Llama-3-70b/workarena-l1.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-Llama-3-70b", - "study_id": "study_id", - "benchmark": "WorkArena-L1", - "score": 17.9, - "std_err": 0.6, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2021-01-01 12:00:00" - } -] \ No newline at end of file diff --git a/results/Bgym-Llama-3-70b/workarena-l2.json b/results/Bgym-Llama-3-70b/workarena-l2.json deleted file mode 100644 index 0f0f8451f47f7022fe3b85923f9356c2f1617c15..0000000000000000000000000000000000000000 --- a/results/Bgym-Llama-3-70b/workarena-l2.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-Llama-3-70b", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "WorkArena-L2", - "score": 0.0, - "std_err": 0.0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-Llama-3-70b/workarena-l3.json b/results/Bgym-Llama-3-70b/workarena-l3.json deleted file mode 100644 index acf0a81f58b3ab0e03178ad4f7ae27cb9fec4e97..0000000000000000000000000000000000000000 --- a/results/Bgym-Llama-3-70b/workarena-l3.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-Llama-3-70b", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "WorkArena-L3", - "score": 0.0, - "std_err": 0.0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-Mixtral-8x22b/README.md b/results/Bgym-Mixtral-8x22b/README.md deleted file mode 100644 index 25b17de698790810b5a434228a08691aa048e4ca..0000000000000000000000000000000000000000 --- a/results/Bgym-Mixtral-8x22b/README.md +++ /dev/null @@ -1 +0,0 @@ -## Mixtral 8x22B \ No newline at end of file diff --git a/results/Bgym-Mixtral-8x22b/miniwob.json b/results/Bgym-Mixtral-8x22b/miniwob.json deleted file mode 100644 index 0b6ea125b66d8032f19d3922284cadcfe7e5b957..0000000000000000000000000000000000000000 --- a/results/Bgym-Mixtral-8x22b/miniwob.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-Mixtral-8x22b", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "MiniWoB", - "score": 62.4, - "std_err": 0.5, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-Mixtral-8x22b/webarena.json b/results/Bgym-Mixtral-8x22b/webarena.json deleted file mode 100644 index 823344e109e805942efd8e34caa90d2e4a0c4d33..0000000000000000000000000000000000000000 --- a/results/Bgym-Mixtral-8x22b/webarena.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-Mixtral-8x22b", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "WebArena", - "score": 12.6, - "std_err": 0.9, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-Mixtral-8x22b/workarena-l1.json b/results/Bgym-Mixtral-8x22b/workarena-l1.json deleted file mode 100644 index 80661f71df672452d7ae4b1cf8af6edfcdd75f94..0000000000000000000000000000000000000000 --- a/results/Bgym-Mixtral-8x22b/workarena-l1.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-Mixtral-8x22b", - "study_id": "study_id", - "benchmark": "WorkArena-L1", - "score": 12.4, - "std_err": 0.7, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original", - "date_time": "2021-01-04 12:06:00" - } -] \ No newline at end of file diff --git a/results/Bgym-Mixtral-8x22b/workarena-l2.json b/results/Bgym-Mixtral-8x22b/workarena-l2.json deleted file mode 100644 index fbc2324755d166804d3586bf0f54766c3c940232..0000000000000000000000000000000000000000 --- a/results/Bgym-Mixtral-8x22b/workarena-l2.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-Mixtral-8x22b", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "WorkArena-L2", - "score": 0.0, - "std_err": 0.0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-Mixtral-8x22b/workarena-l3.json b/results/Bgym-Mixtral-8x22b/workarena-l3.json deleted file mode 100644 index 2cfe04fa51a97f69d7172f239484939e0716102a..0000000000000000000000000000000000000000 --- a/results/Bgym-Mixtral-8x22b/workarena-l3.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "agent_name": "Bgym-Mixtral-8x22b", - "study_id": "study_id", - "date_time": "2021-01-01 12:00:00", - "benchmark": "WorkArena-L3", - "score": 0.0, - "std_err": 0.0, - "benchmark_specific": "No", - "benchmark_tuned": "No", - "followed_evaluation_protocol": "Yes", - "reproducible": "Yes", - "comments": "NA", - "original_or_reproduced": "Original" - } -] \ No newline at end of file diff --git a/results/Bgym-Claude-3.5-Sonnet/README.md b/results/GenericAgent-Claude-3.5-Sonnet/README.md similarity index 100% rename from results/Bgym-Claude-3.5-Sonnet/README.md rename to results/GenericAgent-Claude-3.5-Sonnet/README.md diff --git a/results/Bgym-Claude-3.5-Sonnet/assistantbench.json b/results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json similarity index 87% rename from results/Bgym-Claude-3.5-Sonnet/assistantbench.json rename to results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json index c4dc58a834bd8d454e15cf824fe700da52d1a774..2817e97e8381f7b5e7f0feea80ced0481f14740b 100644 --- a/results/Bgym-Claude-3.5-Sonnet/assistantbench.json +++ b/results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Claude-3.5-Sonnet", + "agent_name": "GenericAgent-Claude-3.5-Sonnet", "study_id": "study_id", "benchmark": "AssistantBench", "score": 5.2, diff --git a/results/Bgym-Claude-3.5-Sonnet/miniwob.json b/results/GenericAgent-Claude-3.5-Sonnet/miniwob.json similarity index 87% rename from results/Bgym-Claude-3.5-Sonnet/miniwob.json rename to results/GenericAgent-Claude-3.5-Sonnet/miniwob.json index 7ac843b450c25e9079820e355024542a714f8672..d123c6ace4239ce3ad050deec84cbf066df393e5 100644 --- a/results/Bgym-Claude-3.5-Sonnet/miniwob.json +++ b/results/GenericAgent-Claude-3.5-Sonnet/miniwob.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Claude-3.5-Sonnet", + "agent_name": "GenericAgent-Claude-3.5-Sonnet", "study_id": "study_id", "benchmark": "MiniWoB", "score": 69.8, diff --git a/results/Bgym-Claude-3.5-Sonnet/webarena.json b/results/GenericAgent-Claude-3.5-Sonnet/webarena.json similarity index 87% rename from results/Bgym-Claude-3.5-Sonnet/webarena.json rename to results/GenericAgent-Claude-3.5-Sonnet/webarena.json index d00d09d6f46bf170b56d4ac765a0bad6cfc19a79..2986f01595d3e3989e51b25268e1a0e71f78582b 100644 --- a/results/Bgym-Claude-3.5-Sonnet/webarena.json +++ b/results/GenericAgent-Claude-3.5-Sonnet/webarena.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Claude-3.5-Sonnet", + "agent_name": "GenericAgent-Claude-3.5-Sonnet", "study_id": "study_id", "benchmark": "WebArena", "score": 36.2, diff --git a/results/Bgym-Claude-3.5-Sonnet/weblinx.json b/results/GenericAgent-Claude-3.5-Sonnet/weblinx.json similarity index 87% rename from results/Bgym-Claude-3.5-Sonnet/weblinx.json rename to results/GenericAgent-Claude-3.5-Sonnet/weblinx.json index 6763e3b0f5d9d66b698b6bbf2e6921b4572a1eaa..e5aedb085b87a982ad7aef88dc8d640aad784fb8 100644 --- a/results/Bgym-Claude-3.5-Sonnet/weblinx.json +++ b/results/GenericAgent-Claude-3.5-Sonnet/weblinx.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Claude-3.5-Sonnet", + "agent_name": "GenericAgent-Claude-3.5-Sonnet", "study_id": "study_id", "benchmark": "WebLINX", "score": 13.7, diff --git a/results/Bgym-Claude-3.5-Sonnet/workarena-l1.json b/results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json similarity index 87% rename from results/Bgym-Claude-3.5-Sonnet/workarena-l1.json rename to results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json index ed2c2c2097974bbc7c3845fd8915d6588db7698a..7cb49d00ba787ec9d1c4903755b367e241adc4f9 100644 --- a/results/Bgym-Claude-3.5-Sonnet/workarena-l1.json +++ b/results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Claude-3.5-Sonnet", + "agent_name": "GenericAgent-Claude-3.5-Sonnet", "study_id": "study_id", "benchmark": "WorkArena-L1", "score": 56.4, diff --git a/results/Bgym-Claude-3.5-Sonnet/workarena-l2.json b/results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json similarity index 87% rename from results/Bgym-Claude-3.5-Sonnet/workarena-l2.json rename to results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json index c48c1af20788e1ceb58a90ea9e36bda55db6cfca..b66fb16b550a058b359fa90eb44364104437d7b2 100644 --- a/results/Bgym-Claude-3.5-Sonnet/workarena-l2.json +++ b/results/GenericAgent-Claude-3.5-Sonnet/workarena-l2.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Claude-3.5-Sonnet", + "agent_name": "GenericAgent-Claude-3.5-Sonnet", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L2", diff --git a/results/Bgym-Claude-3.5-Sonnet/workarena-l3.json b/results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json similarity index 87% rename from results/Bgym-Claude-3.5-Sonnet/workarena-l3.json rename to results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json index 9f21ceedef7c1a025ae86e197785dea2f83a7e06..522438fcb0792c3ccdb50b8a19576bd70e988345 100644 --- a/results/Bgym-Claude-3.5-Sonnet/workarena-l3.json +++ b/results/GenericAgent-Claude-3.5-Sonnet/workarena-l3.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Claude-3.5-Sonnet", + "agent_name": "GenericAgent-Claude-3.5-Sonnet", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L3", diff --git a/results/Bgym-GPT-4o-mini/README.md b/results/GenericAgent-GPT-4o-mini/README.md similarity index 100% rename from results/Bgym-GPT-4o-mini/README.md rename to results/GenericAgent-GPT-4o-mini/README.md diff --git a/results/Bgym-GPT-4o-mini/assistantbench.json b/results/GenericAgent-GPT-4o-mini/assistantbench.json similarity index 88% rename from results/Bgym-GPT-4o-mini/assistantbench.json rename to results/GenericAgent-GPT-4o-mini/assistantbench.json index 9428ea816ad49681f3cb4c993d0e68dedd3eea41..e29481fb2c87e364ac7fcd236caddee1b1a1d176 100644 --- a/results/Bgym-GPT-4o-mini/assistantbench.json +++ b/results/GenericAgent-GPT-4o-mini/assistantbench.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-4o-mini", + "agent_name": "GenericAgent-GPT-4o-mini", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "AssistantBench", diff --git a/results/Bgym-GPT-4o-mini/miniwob.json b/results/GenericAgent-GPT-4o-mini/miniwob.json similarity index 88% rename from results/Bgym-GPT-4o-mini/miniwob.json rename to results/GenericAgent-GPT-4o-mini/miniwob.json index 75d99b96bcf7d72ea41c49022be9b294b897b79c..24262492859411f33f84701017a01c37bbd0a332 100644 --- a/results/Bgym-GPT-4o-mini/miniwob.json +++ b/results/GenericAgent-GPT-4o-mini/miniwob.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-4o-mini", + "agent_name": "GenericAgent-GPT-4o-mini", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "MiniWoB", diff --git a/results/Bgym-GPT-4o-mini/webarena.json b/results/GenericAgent-GPT-4o-mini/webarena.json similarity index 88% rename from results/Bgym-GPT-4o-mini/webarena.json rename to results/GenericAgent-GPT-4o-mini/webarena.json index 1c59caff4a0c1f822f3ef2fb9698eb16331df56b..0bd123ecdbbfbeca4be421548b05328f8893aba6 100644 --- a/results/Bgym-GPT-4o-mini/webarena.json +++ b/results/GenericAgent-GPT-4o-mini/webarena.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-4o-mini", + "agent_name": "GenericAgent-GPT-4o-mini", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WebArena", diff --git a/results/Bgym-GPT-4o-mini/weblinx.json b/results/GenericAgent-GPT-4o-mini/weblinx.json similarity index 88% rename from results/Bgym-GPT-4o-mini/weblinx.json rename to results/GenericAgent-GPT-4o-mini/weblinx.json index 356eb5d5c607ca8e12a05ec651d2d47a41144082..ec560a120d2a5e376c98497608705d7c503f4176 100644 --- a/results/Bgym-GPT-4o-mini/weblinx.json +++ b/results/GenericAgent-GPT-4o-mini/weblinx.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-4o-mini", + "agent_name": "GenericAgent-GPT-4o-mini", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WebLINX", diff --git a/results/Bgym-GPT-4o-mini/workarena-l1.json b/results/GenericAgent-GPT-4o-mini/workarena-l1.json similarity index 88% rename from results/Bgym-GPT-4o-mini/workarena-l1.json rename to results/GenericAgent-GPT-4o-mini/workarena-l1.json index f4f444b46e6e6ea843e14ac799d1b510c98f66b5..ae2e1c6b0527cdda8fe18ab685c18081f504208c 100644 --- a/results/Bgym-GPT-4o-mini/workarena-l1.json +++ b/results/GenericAgent-GPT-4o-mini/workarena-l1.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-4o-mini", + "agent_name": "GenericAgent-GPT-4o-mini", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L1", diff --git a/results/Bgym-GPT-4o-mini/workarena-l2.json b/results/GenericAgent-GPT-4o-mini/workarena-l2.json similarity index 88% rename from results/Bgym-GPT-4o-mini/workarena-l2.json rename to results/GenericAgent-GPT-4o-mini/workarena-l2.json index 1c80ee97e16dd3688262007ce69b3c7dfa98720f..6901a7ef5ec6a12e1c15ad50b98b926ee425a97b 100644 --- a/results/Bgym-GPT-4o-mini/workarena-l2.json +++ b/results/GenericAgent-GPT-4o-mini/workarena-l2.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-4o-mini", + "agent_name": "GenericAgent-GPT-4o-mini", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L2", diff --git a/results/Bgym-GPT-4o-mini/workarena-l3.json b/results/GenericAgent-GPT-4o-mini/workarena-l3.json similarity index 88% rename from results/Bgym-GPT-4o-mini/workarena-l3.json rename to results/GenericAgent-GPT-4o-mini/workarena-l3.json index 27ef5630215cb85ece695d757831a94745ca11a0..29a97656776d58983de5dc16bd438b57ae5d9116 100644 --- a/results/Bgym-GPT-4o-mini/workarena-l3.json +++ b/results/GenericAgent-GPT-4o-mini/workarena-l3.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-4o-mini", + "agent_name": "GenericAgent-GPT-4o-mini", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L3", diff --git a/results/Bgym-GPT-4o/README.md b/results/GenericAgent-GPT-4o/README.md similarity index 100% rename from results/Bgym-GPT-4o/README.md rename to results/GenericAgent-GPT-4o/README.md diff --git a/results/Bgym-GPT-4o/assistantbench.json b/results/GenericAgent-GPT-4o/assistantbench.json similarity index 89% rename from results/Bgym-GPT-4o/assistantbench.json rename to results/GenericAgent-GPT-4o/assistantbench.json index 7223e5586bdcb3d5f46987a6ffa5d33abdd17966..04c05cc788cc38f2b67f8dae5ace91d4fac187ee 100644 --- a/results/Bgym-GPT-4o/assistantbench.json +++ b/results/GenericAgent-GPT-4o/assistantbench.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-4o", + "agent_name": "GenericAgent-GPT-4o", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "AssistantBench", diff --git a/results/Bgym-GPT-4o/miniwob.json b/results/GenericAgent-GPT-4o/miniwob.json similarity index 89% rename from results/Bgym-GPT-4o/miniwob.json rename to results/GenericAgent-GPT-4o/miniwob.json index a4dc858ac1c479280afd766c79918b3206ccef4c..717377ed01beed367ec089d296547f8d017f9fdb 100644 --- a/results/Bgym-GPT-4o/miniwob.json +++ b/results/GenericAgent-GPT-4o/miniwob.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-4o", + "agent_name": "GenericAgent-GPT-4o", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "MiniWoB", diff --git a/results/Bgym-GPT-4o/webarena.json b/results/GenericAgent-GPT-4o/webarena.json similarity index 89% rename from results/Bgym-GPT-4o/webarena.json rename to results/GenericAgent-GPT-4o/webarena.json index 30684c7d9644bca29805a47bf38c43506ea4cf84..947aafbb445d03814411d4fb4daf0ff88cd3e10f 100644 --- a/results/Bgym-GPT-4o/webarena.json +++ b/results/GenericAgent-GPT-4o/webarena.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-4o", + "agent_name": "GenericAgent-GPT-4o", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WebArena", diff --git a/results/Bgym-GPT-4o/weblinx.json b/results/GenericAgent-GPT-4o/weblinx.json similarity index 89% rename from results/Bgym-GPT-4o/weblinx.json rename to results/GenericAgent-GPT-4o/weblinx.json index d0eb1f5a22bc287a4132092b9bec567c9ad1e51c..0e6cc2cda023d01b9333bb6e5c5975d8841b87d7 100644 --- a/results/Bgym-GPT-4o/weblinx.json +++ b/results/GenericAgent-GPT-4o/weblinx.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-4o", + "agent_name": "GenericAgent-GPT-4o", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WebLINX", diff --git a/results/Bgym-GPT-4o/workarena-l1.json b/results/GenericAgent-GPT-4o/workarena-l1.json similarity index 89% rename from results/Bgym-GPT-4o/workarena-l1.json rename to results/GenericAgent-GPT-4o/workarena-l1.json index 82191533728bfed194fbf7caf77779894000752e..597df5c7a7dc9c3c8ee38aa2ca90f76b27dd5e8c 100644 --- a/results/Bgym-GPT-4o/workarena-l1.json +++ b/results/GenericAgent-GPT-4o/workarena-l1.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-4o", + "agent_name": "GenericAgent-GPT-4o", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L1", diff --git a/results/Bgym-GPT-4o/workarena-l2.json b/results/GenericAgent-GPT-4o/workarena-l2.json similarity index 89% rename from results/Bgym-GPT-4o/workarena-l2.json rename to results/GenericAgent-GPT-4o/workarena-l2.json index c6739539b0befba0de7bcf3905c5c9d542b97d3d..4bf9f5cf0928b5cf23380d03de268cea99e3f10e 100644 --- a/results/Bgym-GPT-4o/workarena-l2.json +++ b/results/GenericAgent-GPT-4o/workarena-l2.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-4o", + "agent_name": "GenericAgent-GPT-4o", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L2", diff --git a/results/Bgym-GPT-4o/workarena-l3.json b/results/GenericAgent-GPT-4o/workarena-l3.json similarity index 89% rename from results/Bgym-GPT-4o/workarena-l3.json rename to results/GenericAgent-GPT-4o/workarena-l3.json index a5ec6780ea9ca807b4a8ef4d401ca14d33f7e3d6..bc56a8e64b483cc3c5163f57718af7711fc48840 100644 --- a/results/Bgym-GPT-4o/workarena-l3.json +++ b/results/GenericAgent-GPT-4o/workarena-l3.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-4o", + "agent_name": "GenericAgent-GPT-4o", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L3", diff --git a/results/Bgym-GPT-o1-mini/README.md b/results/GenericAgent-GPT-o1-mini/README.md similarity index 100% rename from results/Bgym-GPT-o1-mini/README.md rename to results/GenericAgent-GPT-o1-mini/README.md diff --git a/results/Bgym-GPT-o1-mini/assistantbench.json b/results/GenericAgent-GPT-o1-mini/assistantbench.json similarity index 88% rename from results/Bgym-GPT-o1-mini/assistantbench.json rename to results/GenericAgent-GPT-o1-mini/assistantbench.json index 307925dae4e75587aa979065062c87238dbb8a05..3b3c545c15ddd2f496be318466be01260b44e15d 100644 --- a/results/Bgym-GPT-o1-mini/assistantbench.json +++ b/results/GenericAgent-GPT-o1-mini/assistantbench.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-o1-mini", + "agent_name": "GenericAgent-GPT-o1-mini", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "AssistantBench", diff --git a/results/Bgym-GPT-o1-mini/miniwob.json b/results/GenericAgent-GPT-o1-mini/miniwob.json similarity index 88% rename from results/Bgym-GPT-o1-mini/miniwob.json rename to results/GenericAgent-GPT-o1-mini/miniwob.json index 3483d9051f7751e8fc37713744ad21a74a27066c..46e6e4937b729d992687cdaf838ac67cc814df77 100644 --- a/results/Bgym-GPT-o1-mini/miniwob.json +++ b/results/GenericAgent-GPT-o1-mini/miniwob.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-o1-mini", + "agent_name": "GenericAgent-GPT-o1-mini", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "MiniWoB", diff --git a/results/Bgym-GPT-o1-mini/webarena.json b/results/GenericAgent-GPT-o1-mini/webarena.json similarity index 88% rename from results/Bgym-GPT-o1-mini/webarena.json rename to results/GenericAgent-GPT-o1-mini/webarena.json index 369b02fec15d5e1f5a980f515693e77fc4ecd272..861c3cffe85efd49e9d62eb4d5093829ef52fe10 100644 --- a/results/Bgym-GPT-o1-mini/webarena.json +++ b/results/GenericAgent-GPT-o1-mini/webarena.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-o1-mini", + "agent_name": "GenericAgent-GPT-o1-mini", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WebArena", diff --git a/results/Bgym-GPT-o1-mini/weblinx.json b/results/GenericAgent-GPT-o1-mini/weblinx.json similarity index 88% rename from results/Bgym-GPT-o1-mini/weblinx.json rename to results/GenericAgent-GPT-o1-mini/weblinx.json index 8ceeb6a6aab13421c37130e7c29cd958be2a6f44..cf0a5b827c987bd0870a6405bef252b2e0a7ac69 100644 --- a/results/Bgym-GPT-o1-mini/weblinx.json +++ b/results/GenericAgent-GPT-o1-mini/weblinx.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-o1-mini", + "agent_name": "GenericAgent-GPT-o1-mini", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WebLINX", diff --git a/results/Bgym-GPT-o1-mini/workarena-l1.json b/results/GenericAgent-GPT-o1-mini/workarena-l1.json similarity index 88% rename from results/Bgym-GPT-o1-mini/workarena-l1.json rename to results/GenericAgent-GPT-o1-mini/workarena-l1.json index fac67e3ab9222bb24f82f2e60b7eedee0388b520..9bc93070836fa763371e27350d8a14c797ef9f6d 100644 --- a/results/Bgym-GPT-o1-mini/workarena-l1.json +++ b/results/GenericAgent-GPT-o1-mini/workarena-l1.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-o1-mini", + "agent_name": "GenericAgent-GPT-o1-mini", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L1", diff --git a/results/Bgym-GPT-o1-mini/workarena-l2.json b/results/GenericAgent-GPT-o1-mini/workarena-l2.json similarity index 88% rename from results/Bgym-GPT-o1-mini/workarena-l2.json rename to results/GenericAgent-GPT-o1-mini/workarena-l2.json index c57f1d30c9455651f302d654ccedfb1ee30eabf4..292db2bf5f826ae09fd1d8043022c20100bc3a62 100644 --- a/results/Bgym-GPT-o1-mini/workarena-l2.json +++ b/results/GenericAgent-GPT-o1-mini/workarena-l2.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-o1-mini", + "agent_name": "GenericAgent-GPT-o1-mini", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L2", diff --git a/results/Bgym-GPT-3.5/workarena-l3.json b/results/GenericAgent-GPT-o1-mini/workarena-l3.json similarity index 88% rename from results/Bgym-GPT-3.5/workarena-l3.json rename to results/GenericAgent-GPT-o1-mini/workarena-l3.json index 40093a485842f340d16d25af5768e8d066377a05..97a2b1e29833b771a00942646bc380f9b0b14172 100644 --- a/results/Bgym-GPT-3.5/workarena-l3.json +++ b/results/GenericAgent-GPT-o1-mini/workarena-l3.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-GPT-3.5", + "agent_name": "GenericAgent-GPT-o1-mini", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L3", diff --git a/results/Bgym-Llama-3.1-405b/README.md b/results/GenericAgent-Llama-3.1-405b/README.md similarity index 100% rename from results/Bgym-Llama-3.1-405b/README.md rename to results/GenericAgent-Llama-3.1-405b/README.md diff --git a/results/Bgym-Llama-3.1-405b/assistantbench.json b/results/GenericAgent-Llama-3.1-405b/assistantbench.json similarity index 88% rename from results/Bgym-Llama-3.1-405b/assistantbench.json rename to results/GenericAgent-Llama-3.1-405b/assistantbench.json index 7a6aef8a8b7f7a21b7387ba8b014b7d12e080c1f..6d42af475973c54772ef675d96074b0ec8b16bc5 100644 --- a/results/Bgym-Llama-3.1-405b/assistantbench.json +++ b/results/GenericAgent-Llama-3.1-405b/assistantbench.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Llama-3.1-405b", + "agent_name": "GenericAgent-Llama-3.1-405b", "study_id": "study_id", "benchmark": "AssistantBench", "score": 3.9, diff --git a/results/Bgym-Llama-3.1-405b/miniwob.json b/results/GenericAgent-Llama-3.1-405b/miniwob.json similarity index 87% rename from results/Bgym-Llama-3.1-405b/miniwob.json rename to results/GenericAgent-Llama-3.1-405b/miniwob.json index ff6afc44fdd43fe48caa0c6ec642c04a356effe9..9b72c81f11cce770d9f799cedabbaeec110e4921 100644 --- a/results/Bgym-Llama-3.1-405b/miniwob.json +++ b/results/GenericAgent-Llama-3.1-405b/miniwob.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Llama-3.1-405b", + "agent_name": "GenericAgent-Llama-3.1-405b", "study_id": "study_id", "benchmark": "MiniWoB", "score": 64.6, diff --git a/results/Bgym-Llama-3.1-405b/webarena.json b/results/GenericAgent-Llama-3.1-405b/webarena.json similarity index 88% rename from results/Bgym-Llama-3.1-405b/webarena.json rename to results/GenericAgent-Llama-3.1-405b/webarena.json index 238b9745569a13ef80839afac0677f9e4c1e6839..995808e98aeb8c021efff48b20c32439101dc644 100644 --- a/results/Bgym-Llama-3.1-405b/webarena.json +++ b/results/GenericAgent-Llama-3.1-405b/webarena.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Llama-3.1-405b", + "agent_name": "GenericAgent-Llama-3.1-405b", "study_id": "study_id", "benchmark": "WebArena", "score": 24.0, diff --git a/results/Bgym-Llama-3.1-405b/weblinx.json b/results/GenericAgent-Llama-3.1-405b/weblinx.json similarity index 87% rename from results/Bgym-Llama-3.1-405b/weblinx.json rename to results/GenericAgent-Llama-3.1-405b/weblinx.json index 92d1ccc6ab1b525468bcd69fdafd0c40e6877967..f9ad37e621755c673c405e27b69ba526a85a3f7d 100644 --- a/results/Bgym-Llama-3.1-405b/weblinx.json +++ b/results/GenericAgent-Llama-3.1-405b/weblinx.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Llama-3.1-405b", + "agent_name": "GenericAgent-Llama-3.1-405b", "study_id": "study_id", "benchmark": "WebLINX", "score": 7.9, diff --git a/results/Bgym-Llama-3.1-405b/workarena-l1.json b/results/GenericAgent-Llama-3.1-405b/workarena-l1.json similarity index 88% rename from results/Bgym-Llama-3.1-405b/workarena-l1.json rename to results/GenericAgent-Llama-3.1-405b/workarena-l1.json index cd313d345eb0e117b62811b7b6479021b9ea54fc..1697c31799afbd33aa6f82158a6ac70bc7c7a8a1 100644 --- a/results/Bgym-Llama-3.1-405b/workarena-l1.json +++ b/results/GenericAgent-Llama-3.1-405b/workarena-l1.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Llama-3.1-405b", + "agent_name": "GenericAgent-Llama-3.1-405b", "study_id": "study_id", "benchmark": "WorkArena-L1", "score": 43.3, diff --git a/results/Bgym-Llama-3.1-405b/workarena-l2.json b/results/GenericAgent-Llama-3.1-405b/workarena-l2.json similarity index 88% rename from results/Bgym-Llama-3.1-405b/workarena-l2.json rename to results/GenericAgent-Llama-3.1-405b/workarena-l2.json index 391f94a227fb9b83edce81bd032751bc4838e5ca..1af5375b44abd648dc2223fcdb9ea079bb87c9d6 100644 --- a/results/Bgym-Llama-3.1-405b/workarena-l2.json +++ b/results/GenericAgent-Llama-3.1-405b/workarena-l2.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Llama-3.1-405b", + "agent_name": "GenericAgent-Llama-3.1-405b", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L2", diff --git a/results/Bgym-Llama-3.1-70b/workarena-l3.json b/results/GenericAgent-Llama-3.1-405b/workarena-l3.json similarity index 88% rename from results/Bgym-Llama-3.1-70b/workarena-l3.json rename to results/GenericAgent-Llama-3.1-405b/workarena-l3.json index f5be1f90d3d048cc06696d79ab199297ce07e81c..5e520211e8ca9b333b9fe542dda04d0e6b417946 100644 --- a/results/Bgym-Llama-3.1-70b/workarena-l3.json +++ b/results/GenericAgent-Llama-3.1-405b/workarena-l3.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Llama-3.1-70b", + "agent_name": "GenericAgent-Llama-3.1-405b", "study_id": "study_id", "benchmark": "WorkArena-L3", "score": 0.0, diff --git a/results/Bgym-Llama-3.1-70b/README.md b/results/GenericAgent-Llama-3.1-70b/README.md similarity index 100% rename from results/Bgym-Llama-3.1-70b/README.md rename to results/GenericAgent-Llama-3.1-70b/README.md diff --git a/results/Bgym-Llama-3.1-70b/assistantbench.json b/results/GenericAgent-Llama-3.1-70b/assistantbench.json similarity index 88% rename from results/Bgym-Llama-3.1-70b/assistantbench.json rename to results/GenericAgent-Llama-3.1-70b/assistantbench.json index 9a5bb61f45b6ea6a1f788364e35a4c55fe986373..49bc83231b840b79e57a8f2d896fe1012e9afce6 100644 --- a/results/Bgym-Llama-3.1-70b/assistantbench.json +++ b/results/GenericAgent-Llama-3.1-70b/assistantbench.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Llama-3.1-70b", + "agent_name": "GenericAgent-Llama-3.1-70b", "study_id": "study_id", "benchmark": "AssistantBench", "score": 2.8, diff --git a/results/Bgym-Llama-3.1-70b/miniwob.json b/results/GenericAgent-Llama-3.1-70b/miniwob.json similarity index 88% rename from results/Bgym-Llama-3.1-70b/miniwob.json rename to results/GenericAgent-Llama-3.1-70b/miniwob.json index 037c919bac1cea16f4c53064f9816bb408210c40..dbde1c17d121e4a064a410a7a5ecc2921fdbe260 100644 --- a/results/Bgym-Llama-3.1-70b/miniwob.json +++ b/results/GenericAgent-Llama-3.1-70b/miniwob.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Llama-3.1-70b", + "agent_name": "GenericAgent-Llama-3.1-70b", "study_id": "study_id", "benchmark": "MiniWoB", "score": 57.6, diff --git a/results/Bgym-Llama-3.1-70b/webarena.json b/results/GenericAgent-Llama-3.1-70b/webarena.json similarity index 88% rename from results/Bgym-Llama-3.1-70b/webarena.json rename to results/GenericAgent-Llama-3.1-70b/webarena.json index 80776cee4f94811d958dfeee356854d01ec66c8e..43beebc9688067b3bdf592d3a3d2e1db5b4fb6c0 100644 --- a/results/Bgym-Llama-3.1-70b/webarena.json +++ b/results/GenericAgent-Llama-3.1-70b/webarena.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Llama-3.1-70b", + "agent_name": "GenericAgent-Llama-3.1-70b", "study_id": "study_id", "benchmark": "WebArena", "score": 18.4, diff --git a/results/Bgym-Llama-3.1-70b/weblinx.json b/results/GenericAgent-Llama-3.1-70b/weblinx.json similarity index 88% rename from results/Bgym-Llama-3.1-70b/weblinx.json rename to results/GenericAgent-Llama-3.1-70b/weblinx.json index 2031c283f003538a38aed3bda3beb1af660bcd42..b8b1fe95dbbf0861c22e8fb57c891d364e217470 100644 --- a/results/Bgym-Llama-3.1-70b/weblinx.json +++ b/results/GenericAgent-Llama-3.1-70b/weblinx.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Llama-3.1-70b", + "agent_name": "GenericAgent-Llama-3.1-70b", "study_id": "study_id", "benchmark": "WebLINX", "score": 8.9, diff --git a/results/Bgym-Llama-3.1-70b/workarena-l1.json b/results/GenericAgent-Llama-3.1-70b/workarena-l1.json similarity index 88% rename from results/Bgym-Llama-3.1-70b/workarena-l1.json rename to results/GenericAgent-Llama-3.1-70b/workarena-l1.json index a9adb101bdae51e253add0e118656f129fd65b60..a7f0eef5144d214e8d904df1460d07256441d3db 100644 --- a/results/Bgym-Llama-3.1-70b/workarena-l1.json +++ b/results/GenericAgent-Llama-3.1-70b/workarena-l1.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Llama-3.1-70b", + "agent_name": "GenericAgent-Llama-3.1-70b", "study_id": "study_id", "benchmark": "WorkArena-L1", "score": 27.9, diff --git a/results/Bgym-Llama-3.1-70b/workarena-l2.json b/results/GenericAgent-Llama-3.1-70b/workarena-l2.json similarity index 88% rename from results/Bgym-Llama-3.1-70b/workarena-l2.json rename to results/GenericAgent-Llama-3.1-70b/workarena-l2.json index 19f54715a6e2fe577f6456baa9cb72eb80cbc814..96f6a47643e8ae54550e8a85f9fd76abce116aaf 100644 --- a/results/Bgym-Llama-3.1-70b/workarena-l2.json +++ b/results/GenericAgent-Llama-3.1-70b/workarena-l2.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Llama-3.1-70b", + "agent_name": "GenericAgent-Llama-3.1-70b", "study_id": "study_id", "date_time": "2021-01-01 12:00:00", "benchmark": "WorkArena-L2", diff --git a/results/Bgym-Llama-3.1-405b/workarena-l3.json b/results/GenericAgent-Llama-3.1-70b/workarena-l3.json similarity index 88% rename from results/Bgym-Llama-3.1-405b/workarena-l3.json rename to results/GenericAgent-Llama-3.1-70b/workarena-l3.json index 0b4c21ced555acbd3c31ffa9b72affd578084d4c..3801b2ebd8c41bd70d19394d31833d384321d97b 100644 --- a/results/Bgym-Llama-3.1-405b/workarena-l3.json +++ b/results/GenericAgent-Llama-3.1-70b/workarena-l3.json @@ -1,6 +1,6 @@ [ { - "agent_name": "Bgym-Llama-3.1-405b", + "agent_name": "GenericAgent-Llama-3.1-70b", "study_id": "study_id", "benchmark": "WorkArena-L3", "score": 0.0,
{sanitize_column_name(column)}{sanitize_column_name(column)}
{summary}{details}