Added readme, visualwebarena
Browse files- app.py +1 -1
- results/GenericAgent-Claude-3.5-Sonnet/README.md +3 -1
- results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json +16 -0
- results/GenericAgent-GPT-4o-mini/visualwebarena.json +16 -0
- results/GenericAgent-GPT-4o/README.md +46 -1
- results/GenericAgent-GPT-4o/visualwebarena.json +16 -0
- results/GenericAgent-GPT-o1-mini/README.md +46 -1
- results/GenericAgent-Llama-3.1-405b/README.md +46 -1
- results/GenericAgent-Llama-3.1-70b/README.md +46 -1
app.py
CHANGED
@@ -17,7 +17,7 @@ import re
|
|
17 |
import html
|
18 |
from typing import Dict, Any
|
19 |
|
20 |
-
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB", "WebLINX", "AssistantBench"]
|
21 |
|
22 |
def sanitize_agent_name(agent_name):
|
23 |
# Only allow alphanumeric chars, hyphen, underscore
|
|
|
17 |
import html
|
18 |
from typing import Dict, Any
|
19 |
|
20 |
+
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB", "WebLINX", "VisualWebArena", "AssistantBench"]
|
21 |
|
22 |
def sanitize_agent_name(agent_name):
|
23 |
# Only allow alphanumeric chars, hyphen, underscore
|
results/GenericAgent-Claude-3.5-Sonnet/README.md
CHANGED
@@ -41,4 +41,6 @@ BASE_FLAGS = GenericPromptFlags(
|
|
41 |
be_cautious=True,
|
42 |
extra_instructions=None,
|
43 |
)
|
44 |
-
```
|
|
|
|
|
|
41 |
be_cautious=True,
|
42 |
extra_instructions=None,
|
43 |
)
|
44 |
+
```
|
45 |
+
|
46 |
+
Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
|
results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "VisualWebArena",
|
6 |
+
"score": 21.0,
|
7 |
+
"std_err": 1.3,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-4o-mini/visualwebarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "VisualWebArena",
|
7 |
+
"score": 16.9,
|
8 |
+
"std_err": 1.2,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-4o/README.md
CHANGED
@@ -1 +1,46 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### GenericAgent-GPT-4o
|
2 |
+
|
3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
+
|
5 |
+
It uses GPT-4o as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
+
```python
|
7 |
+
BASE_FLAGS = GenericPromptFlags(
|
8 |
+
obs=dp.ObsFlags(
|
9 |
+
use_html=False,
|
10 |
+
use_ax_tree=True,
|
11 |
+
use_focused_element=True,
|
12 |
+
use_error_logs=True,
|
13 |
+
use_history=True,
|
14 |
+
use_past_error_logs=False,
|
15 |
+
use_action_history=True,
|
16 |
+
use_think_history=True, # gpt-4o config except for this line
|
17 |
+
use_diff=False,
|
18 |
+
html_type="pruned_html",
|
19 |
+
use_screenshot=False,
|
20 |
+
use_som=False,
|
21 |
+
extract_visible_tag=True,
|
22 |
+
extract_clickable_tag=True,
|
23 |
+
extract_coords="False",
|
24 |
+
filter_visible_elements_only=False,
|
25 |
+
),
|
26 |
+
action=dp.ActionFlags(
|
27 |
+
multi_actions=False,
|
28 |
+
action_set="bid",
|
29 |
+
long_description=False,
|
30 |
+
individual_examples=False,
|
31 |
+
),
|
32 |
+
use_plan=False,
|
33 |
+
use_criticise=False,
|
34 |
+
use_thinking=True,
|
35 |
+
use_memory=False,
|
36 |
+
use_concrete_example=True,
|
37 |
+
use_abstract_example=True,
|
38 |
+
use_hints=True,
|
39 |
+
enable_chat=False,
|
40 |
+
max_prompt_tokens=40_000,
|
41 |
+
be_cautious=True,
|
42 |
+
extra_instructions=None,
|
43 |
+
)
|
44 |
+
```
|
45 |
+
|
46 |
+
Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
|
results/GenericAgent-GPT-4o/visualwebarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "VisualWebArena",
|
7 |
+
"score": 26.7,
|
8 |
+
"std_err": 1.5,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-GPT-o1-mini/README.md
CHANGED
@@ -1 +1,46 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### GenericAgent-GPT-o1-mini
|
2 |
+
|
3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
+
|
5 |
+
It uses o1-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
+
```python
|
7 |
+
BASE_FLAGS = GenericPromptFlags(
|
8 |
+
obs=dp.ObsFlags(
|
9 |
+
use_html=False,
|
10 |
+
use_ax_tree=True,
|
11 |
+
use_focused_element=True,
|
12 |
+
use_error_logs=True,
|
13 |
+
use_history=True,
|
14 |
+
use_past_error_logs=False,
|
15 |
+
use_action_history=True,
|
16 |
+
use_think_history=True, # gpt-4o config except for this line
|
17 |
+
use_diff=False,
|
18 |
+
html_type="pruned_html",
|
19 |
+
use_screenshot=False,
|
20 |
+
use_som=False,
|
21 |
+
extract_visible_tag=True,
|
22 |
+
extract_clickable_tag=True,
|
23 |
+
extract_coords="False",
|
24 |
+
filter_visible_elements_only=False,
|
25 |
+
),
|
26 |
+
action=dp.ActionFlags(
|
27 |
+
multi_actions=False,
|
28 |
+
action_set="bid",
|
29 |
+
long_description=False,
|
30 |
+
individual_examples=False,
|
31 |
+
),
|
32 |
+
use_plan=False,
|
33 |
+
use_criticise=False,
|
34 |
+
use_thinking=True,
|
35 |
+
use_memory=False,
|
36 |
+
use_concrete_example=True,
|
37 |
+
use_abstract_example=True,
|
38 |
+
use_hints=True,
|
39 |
+
enable_chat=False,
|
40 |
+
max_prompt_tokens=40_000,
|
41 |
+
be_cautious=True,
|
42 |
+
extra_instructions=None,
|
43 |
+
)
|
44 |
+
```
|
45 |
+
|
46 |
+
Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
|
results/GenericAgent-Llama-3.1-405b/README.md
CHANGED
@@ -1 +1,46 @@
|
|
1 |
-
### Llama-3.1-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### GenericAgent-Llama-3.1-405b
|
2 |
+
|
3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
+
|
5 |
+
It uses Llama-3.1-405b as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
+
```python
|
7 |
+
BASE_FLAGS = GenericPromptFlags(
|
8 |
+
obs=dp.ObsFlags(
|
9 |
+
use_html=False,
|
10 |
+
use_ax_tree=True,
|
11 |
+
use_focused_element=True,
|
12 |
+
use_error_logs=True,
|
13 |
+
use_history=True,
|
14 |
+
use_past_error_logs=False,
|
15 |
+
use_action_history=True,
|
16 |
+
use_think_history=True, # gpt-4o config except for this line
|
17 |
+
use_diff=False,
|
18 |
+
html_type="pruned_html",
|
19 |
+
use_screenshot=False,
|
20 |
+
use_som=False,
|
21 |
+
extract_visible_tag=True,
|
22 |
+
extract_clickable_tag=True,
|
23 |
+
extract_coords="False",
|
24 |
+
filter_visible_elements_only=False,
|
25 |
+
),
|
26 |
+
action=dp.ActionFlags(
|
27 |
+
multi_actions=False,
|
28 |
+
action_set="bid",
|
29 |
+
long_description=False,
|
30 |
+
individual_examples=False,
|
31 |
+
),
|
32 |
+
use_plan=False,
|
33 |
+
use_criticise=False,
|
34 |
+
use_thinking=True,
|
35 |
+
use_memory=False,
|
36 |
+
use_concrete_example=True,
|
37 |
+
use_abstract_example=True,
|
38 |
+
use_hints=True,
|
39 |
+
enable_chat=False,
|
40 |
+
max_prompt_tokens=40_000,
|
41 |
+
be_cautious=True,
|
42 |
+
extra_instructions=None,
|
43 |
+
)
|
44 |
+
```
|
45 |
+
|
46 |
+
Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
|
results/GenericAgent-Llama-3.1-70b/README.md
CHANGED
@@ -1 +1,46 @@
|
|
1 |
-
### Llama-3.1-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### GenericAgent-Llama-3.1-70b
|
2 |
+
|
3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
+
|
5 |
+
It uses Llama-3.1-70b as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
+
```python
|
7 |
+
BASE_FLAGS = GenericPromptFlags(
|
8 |
+
obs=dp.ObsFlags(
|
9 |
+
use_html=False,
|
10 |
+
use_ax_tree=True,
|
11 |
+
use_focused_element=True,
|
12 |
+
use_error_logs=True,
|
13 |
+
use_history=True,
|
14 |
+
use_past_error_logs=False,
|
15 |
+
use_action_history=True,
|
16 |
+
use_think_history=True, # gpt-4o config except for this line
|
17 |
+
use_diff=False,
|
18 |
+
html_type="pruned_html",
|
19 |
+
use_screenshot=False,
|
20 |
+
use_som=False,
|
21 |
+
extract_visible_tag=True,
|
22 |
+
extract_clickable_tag=True,
|
23 |
+
extract_coords="False",
|
24 |
+
filter_visible_elements_only=False,
|
25 |
+
),
|
26 |
+
action=dp.ActionFlags(
|
27 |
+
multi_actions=False,
|
28 |
+
action_set="bid",
|
29 |
+
long_description=False,
|
30 |
+
individual_examples=False,
|
31 |
+
),
|
32 |
+
use_plan=False,
|
33 |
+
use_criticise=False,
|
34 |
+
use_thinking=True,
|
35 |
+
use_memory=False,
|
36 |
+
use_concrete_example=True,
|
37 |
+
use_abstract_example=True,
|
38 |
+
use_hints=True,
|
39 |
+
enable_chat=False,
|
40 |
+
max_prompt_tokens=40_000,
|
41 |
+
be_cautious=True,
|
42 |
+
extra_instructions=None,
|
43 |
+
)
|
44 |
+
```
|
45 |
+
|
46 |
+
Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
|