meghsn committed on
Commit
90d6776
·
1 Parent(s): 2a1e680

Added readme, visualwebarena

Browse files
app.py CHANGED
@@ -17,7 +17,7 @@ import re
17
  import html
18
  from typing import Dict, Any
19
 
20
- BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB", "WebLINX", "AssistantBench"]
21
 
22
  def sanitize_agent_name(agent_name):
23
  # Only allow alphanumeric chars, hyphen, underscore
 
17
  import html
18
  from typing import Dict, Any
19
 
20
+ BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB", "WebLINX", "VisualWebArena", "AssistantBench"]
21
 
22
  def sanitize_agent_name(agent_name):
23
  # Only allow alphanumeric chars, hyphen, underscore
results/GenericAgent-Claude-3.5-Sonnet/README.md CHANGED
@@ -41,4 +41,6 @@ BASE_FLAGS = GenericPromptFlags(
41
  be_cautious=True,
42
  extra_instructions=None,
43
  )
44
- ```
 
 
 
41
  be_cautious=True,
42
  extra_instructions=None,
43
  )
44
+ ```
45
+
46
+ Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
+ "study_id": "study_id",
5
+ "benchmark": "VisualWebArena",
6
+ "score": 21.0,
7
+ "std_err": 1.3,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/GenericAgent-GPT-4o-mini/visualwebarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "GenericAgent-GPT-4o-mini",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "VisualWebArena",
7
+ "score": 16.9,
8
+ "std_err": 1.2,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/GenericAgent-GPT-4o/README.md CHANGED
@@ -1 +1,46 @@
1
- ## GPT-4o model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### GenericAgent-GPT-4o
2
+
3
+ This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
4
+
5
+ It uses GPT-4o as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
6
+ ```python
7
+ BASE_FLAGS = GenericPromptFlags(
8
+ obs=dp.ObsFlags(
9
+ use_html=False,
10
+ use_ax_tree=True,
11
+ use_focused_element=True,
12
+ use_error_logs=True,
13
+ use_history=True,
14
+ use_past_error_logs=False,
15
+ use_action_history=True,
16
+ use_think_history=True, # gpt-4o config except for this line
17
+ use_diff=False,
18
+ html_type="pruned_html",
19
+ use_screenshot=False,
20
+ use_som=False,
21
+ extract_visible_tag=True,
22
+ extract_clickable_tag=True,
23
+ extract_coords="False",
24
+ filter_visible_elements_only=False,
25
+ ),
26
+ action=dp.ActionFlags(
27
+ multi_actions=False,
28
+ action_set="bid",
29
+ long_description=False,
30
+ individual_examples=False,
31
+ ),
32
+ use_plan=False,
33
+ use_criticise=False,
34
+ use_thinking=True,
35
+ use_memory=False,
36
+ use_concrete_example=True,
37
+ use_abstract_example=True,
38
+ use_hints=True,
39
+ enable_chat=False,
40
+ max_prompt_tokens=40_000,
41
+ be_cautious=True,
42
+ extra_instructions=None,
43
+ )
44
+ ```
45
+
46
+ Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
results/GenericAgent-GPT-4o/visualwebarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "GenericAgent-GPT-4o",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "VisualWebArena",
7
+ "score": 26.7,
8
+ "std_err": 1.5,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/GenericAgent-GPT-o1-mini/README.md CHANGED
@@ -1 +1,46 @@
1
- ## GPT-o1-mini model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### GenericAgent-GPT-o1-mini
2
+
3
+ This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
4
+
5
+ It uses o1-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
6
+ ```python
7
+ BASE_FLAGS = GenericPromptFlags(
8
+ obs=dp.ObsFlags(
9
+ use_html=False,
10
+ use_ax_tree=True,
11
+ use_focused_element=True,
12
+ use_error_logs=True,
13
+ use_history=True,
14
+ use_past_error_logs=False,
15
+ use_action_history=True,
16
+ use_think_history=True, # gpt-4o config except for this line
17
+ use_diff=False,
18
+ html_type="pruned_html",
19
+ use_screenshot=False,
20
+ use_som=False,
21
+ extract_visible_tag=True,
22
+ extract_clickable_tag=True,
23
+ extract_coords="False",
24
+ filter_visible_elements_only=False,
25
+ ),
26
+ action=dp.ActionFlags(
27
+ multi_actions=False,
28
+ action_set="bid",
29
+ long_description=False,
30
+ individual_examples=False,
31
+ ),
32
+ use_plan=False,
33
+ use_criticise=False,
34
+ use_thinking=True,
35
+ use_memory=False,
36
+ use_concrete_example=True,
37
+ use_abstract_example=True,
38
+ use_hints=True,
39
+ enable_chat=False,
40
+ max_prompt_tokens=40_000,
41
+ be_cautious=True,
42
+ extra_instructions=None,
43
+ )
44
+ ```
45
+
46
+ Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
results/GenericAgent-Llama-3.1-405b/README.md CHANGED
@@ -1 +1,46 @@
1
- ### Llama-3.1-405B
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### GenericAgent-Llama-3.1-405b
2
+
3
+ This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
4
+
5
+ It uses Llama-3.1-405b as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
6
+ ```python
7
+ BASE_FLAGS = GenericPromptFlags(
8
+ obs=dp.ObsFlags(
9
+ use_html=False,
10
+ use_ax_tree=True,
11
+ use_focused_element=True,
12
+ use_error_logs=True,
13
+ use_history=True,
14
+ use_past_error_logs=False,
15
+ use_action_history=True,
16
+ use_think_history=True, # gpt-4o config except for this line
17
+ use_diff=False,
18
+ html_type="pruned_html",
19
+ use_screenshot=False,
20
+ use_som=False,
21
+ extract_visible_tag=True,
22
+ extract_clickable_tag=True,
23
+ extract_coords="False",
24
+ filter_visible_elements_only=False,
25
+ ),
26
+ action=dp.ActionFlags(
27
+ multi_actions=False,
28
+ action_set="bid",
29
+ long_description=False,
30
+ individual_examples=False,
31
+ ),
32
+ use_plan=False,
33
+ use_criticise=False,
34
+ use_thinking=True,
35
+ use_memory=False,
36
+ use_concrete_example=True,
37
+ use_abstract_example=True,
38
+ use_hints=True,
39
+ enable_chat=False,
40
+ max_prompt_tokens=40_000,
41
+ be_cautious=True,
42
+ extra_instructions=None,
43
+ )
44
+ ```
45
+
46
+ Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
results/GenericAgent-Llama-3.1-70b/README.md CHANGED
@@ -1 +1,46 @@
1
- ### Llama-3.1-70B
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### GenericAgent-Llama-3.1-70b
2
+
3
+ This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
4
+
5
+ It uses Llama-3.1-70b as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
6
+ ```python
7
+ BASE_FLAGS = GenericPromptFlags(
8
+ obs=dp.ObsFlags(
9
+ use_html=False,
10
+ use_ax_tree=True,
11
+ use_focused_element=True,
12
+ use_error_logs=True,
13
+ use_history=True,
14
+ use_past_error_logs=False,
15
+ use_action_history=True,
16
+ use_think_history=True, # gpt-4o config except for this line
17
+ use_diff=False,
18
+ html_type="pruned_html",
19
+ use_screenshot=False,
20
+ use_som=False,
21
+ extract_visible_tag=True,
22
+ extract_clickable_tag=True,
23
+ extract_coords="False",
24
+ filter_visible_elements_only=False,
25
+ ),
26
+ action=dp.ActionFlags(
27
+ multi_actions=False,
28
+ action_set="bid",
29
+ long_description=False,
30
+ individual_examples=False,
31
+ ),
32
+ use_plan=False,
33
+ use_criticise=False,
34
+ use_thinking=True,
35
+ use_memory=False,
36
+ use_concrete_example=True,
37
+ use_abstract_example=True,
38
+ use_hints=True,
39
+ enable_chat=False,
40
+ max_prompt_tokens=40_000,
41
+ be_cautious=True,
42
+ extra_instructions=None,
43
+ )
44
+ ```
45
+
46
+ Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).