Spaces:

ServiceNow
/

browsergym-leaderboard

Running

App Files Files Community

browsergym-leaderboard / app.py

meghsn

Result updates

d5581cc about 2 months ago

raw

history blame

19.3 kB

	import json
	import re
	import os
	import streamlit as st
	import requests
	import pandas as pd
	from io import StringIO
	import plotly.graph_objs as go
	from huggingface_hub import HfApi
	from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
	import streamlit.components.v1 as components
	from datetime import datetime

	from urllib.parse import quote
	from pathlib import Path
	import re
	import html
	from typing import Dict, Any

	# Benchmarks shown on the leaderboard; main() creates one tab per entry, in
	# this order. check_sanity() and main() read each benchmark's results from
	# "results/<agent>/<benchmark name lower-cased>.json".
	BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB", "WebLINX", "AssistantBench"]

	def sanitize_agent_name(agent_name):
	    """Validate an agent name and return it unchanged.

	    A valid name starts with an ASCII letter, digit, hyphen or underscore and
	    continues with any of those characters or dots.

	    Args:
	        agent_name: candidate agent (directory) name.

	    Returns:
	        The same string, when it is valid.

	    Raises:
	        ValueError: if the name starts with a dot, is empty, or contains any
	            other character (including a trailing newline).
	    """
	    if agent_name.startswith('.'):
	        raise ValueError("Agent name cannot start with a dot")

	    # fullmatch rather than match + "$": "$" also matches just before a
	    # trailing newline, which let names ending in a newline through. The
	    # hyphen sits last in each class so it can never be read as a range.
	    if not re.fullmatch(r"[a-zA-Z0-9_-][a-zA-Z0-9_.-]*", agent_name):
	        raise ValueError("Invalid agent name format")
	    return agent_name

	def safe_path_join(*parts):
	    """Join *parts* onto the ``results`` directory and return the resolved path.

	    Args:
	        *parts: path components appended to the resolved ``results`` directory.

	    Returns:
	        The resolved ``pathlib.Path``; it is ``results`` itself or lies inside it.

	    Raises:
	        ValueError: if the components cannot be joined or resolved, or if the
	            resolved path is outside the ``results`` directory.
	    """
	    base = Path("results").resolve()
	    try:
	        path = base.joinpath(*parts).resolve()
	    except (TypeError, ValueError, OSError, RuntimeError) as err:
	        raise ValueError("Invalid path") from err

	    # Compare path components, not string prefixes: a sibling directory such
	    # as "results_backup" shares the "results" string prefix but is outside
	    # base, so a startswith() check would wrongly accept it.
	    try:
	        path.relative_to(base)
	    except ValueError:
	        raise ValueError("Path traversal detected") from None
	    return path

	def sanitize_column_name(col: str) -> str:
	    """Return the column label converted to text with HTML special characters escaped."""
	    label = str(col)
	    return html.escape(label)

	def sanitize_cell_value(value: Any) -> str:
	    """Render one table cell value as an HTML-safe string.

	    Numbers are returned via ``str()``. A string containing "±" is rendered
	    as the text before the first "±", followed by "±" and the remainder
	    wrapped in a smaller, lighter ``<span>``. Anything else is converted with
	    ``str()`` and HTML-escaped.

	    Args:
	        value: the raw cell value.

	    Returns:
	        Markup that can be placed inside a ``<td>`` element.
	    """
	    if isinstance(value, (int, float)):
	        return str(value)
	    if isinstance(value, str) and '±' in value:
	        # Split on the first "±" only, so additional "±" signs cannot break
	        # the two-way unpacking. Both halves are escaped because they are
	        # interpolated into markup and may come from free-text fields.
	        score, std_err = value.split('±', 1)
	        score = html.escape(score.strip())
	        std_err = html.escape(std_err.strip())
	        return f'{score} <span style="font-size: smaller; color: var(--lighter-color);">±{std_err}</span>'
	    return html.escape(str(value))

	def create_html_table_main(df):
	    """Render the sort controls and return the main leaderboard as an HTML string.

	    Draws a "Sort by" select box (defaulting to the "WebArena" column) and an
	    ascending/descending radio button, sorts *df* accordingly, and builds a
	    <style> block followed by one <table> with a header cell per column and a
	    row per DataFrame row.
	    """
	    column_names = df.columns.tolist()
	    left, right = st.columns([2,6])
	    with left:
	        sort_column = st.selectbox("Sort by", column_names, index=column_names.index("WebArena"), key="main_sort_column")
	    with right:
	        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key="main_sort_order")

	    def to_sortable(cell):
	        # "-" sorts below every number; numeric text sorts as a float; any
	        # other text is compared as-is.
	        if cell == "-":
	            return float('-inf')
	        try:
	            return float(cell)
	        except ValueError:
	            return cell

	    df = df.sort_values(
	        by=sort_column,
	        ascending=(sort_order == "Ascending"),
	        key=lambda series: series.apply(to_sortable),
	    )

	    stylesheet = '''
	<style>
	table {
	width: 100%;
	border-collapse: collapse;
	}
	th, td {
	border: 1px solid #ddd;
	padding: 8px;
	text-align: center;
	}
	th {
	font-weight: bold;
	}
	.table-container {
	padding-bottom: 20px;
	}
	</style>
	'''
	    pieces = [stylesheet, '<div class="table-container">', '<table>', '<thead><tr>']
	    pieces.extend(f'<th>{sanitize_column_name(name)}</th>' for name in df.columns)
	    pieces.append('</tr></thead>')
	    pieces.append('<tbody>')
	    for _, record in df.iterrows():
	        pieces.append('<tr>')
	        for name in df.columns:
	            # "Agent" cells are inserted verbatim (main() passes pre-built
	            # <a> markup in that column); every other cell is sanitized.
	            cell = record[name] if name == "Agent" else sanitize_cell_value(record[name])
	            pieces.append(f'<td>{cell}</td>')
	        pieces.append('</tr>')
	    pieces.append('</tbody></table>')
	    pieces.append('</div>')
	    return ''.join(pieces)

	def create_html_table_benchmark(df, benchmark):
	    """Render the sort controls and return one benchmark's table as an HTML string.

	    Draws a "Sort by" select box (defaulting to "Score") and an
	    ascending/descending radio button, both keyed by *benchmark*, sorts *df*
	    and builds a <style> block followed by one <table>.

	    The "Reproduced_all" and "std_err" columns are not shown as columns of
	    their own: "std_err" is appended to the "Score" cell and "Reproduced_all"
	    is listed inside a collapsible <details> element in the "Reproduced" cell.

	    Args:
	        df: DataFrame with at least the "Score", "std_err", "Reproduced" and
	            "Reproduced_all" columns.
	        benchmark: benchmark name, used to make the widget keys unique.

	    Returns:
	        The table markup as a string.
	    """
	    col1, col2 = st.columns([2,6])
	    with col1:
	        sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("Score"), key=f"benchmark_sort_column_{benchmark}")
	    with col2:
	        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key=f"benchmark_sort_order_{benchmark}")

	    def get_sort_value(cell):
	        # "-" sorts below every number; numeric text sorts as a float;
	        # anything else (free text, lists) falls back to its string form.
	        if cell == "-":
	            return float('-inf')
	        try:
	            return float(cell)
	        except (TypeError, ValueError):
	            return str(cell)

	    def sort_key(series):
	        keys = series.apply(get_sort_value)
	        if keys.map(lambda key: isinstance(key, str)).any():
	            # Text column: floats (from "-" or numeric-looking cells) cannot
	            # be ordered against strings, so compare the whole column as text
	            # instead of raising TypeError.
	            return series.astype(str)
	        return keys

	    df = df.sort_values(by=sort_column, ascending=(sort_order == "Ascending"), key=sort_key)

	    stylesheet = '''
	<style>
	table {
	width: 100%;
	border-collapse: collapse;
	}
	th, td {
	border: 1px solid #ddd;
	padding: 8px;
	text-align: center;
	}
	th {
	font-weight: bold;
	}
	.table-container {
	padding-bottom: 20px;
	}
	</style>
	'''
	    hidden_columns = ("Reproduced_all", "std_err")
	    visible_columns = [column for column in df.columns if column not in hidden_columns]

	    pieces = [stylesheet, '<div class="table-container">', '<table>', '<thead><tr>']
	    pieces.extend(f'<th>{sanitize_column_name(column)}</th>' for column in visible_columns)
	    pieces.append('</tr></thead>')
	    pieces.append('<tbody>')
	    for _, row in df.iterrows():
	        pieces.append('<tr>')
	        for column in visible_columns:
	            if column == "Reproduced" and row[column] != "-":
	                summary = sanitize_cell_value(row[column])
	                details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
	                pieces.append(f'<td><details><summary>{summary}</summary>{details}</details></td>')
	            elif column == "Score" and row[column] != "-":
	                score_with_std_err = f'{row[column]} ± {row["std_err"]}'
	                pieces.append(f'<td>{sanitize_cell_value(score_with_std_err)}</td>')
	            else:
	                # Also covers a missing score, shown as a plain "-" rather
	                # than "- ± -".
	                pieces.append(f'<td>{sanitize_cell_value(row[column])}</td>')
	        pieces.append('</tr>')
	    pieces.append('</tbody></table>')
	    pieces.append('</div>')
	    return ''.join(pieces)

	def check_sanity(agent):
	    """Return True when every results file present for *agent* is well-formed.

	    For each benchmark in BENCHMARKS the file
	    "results/<agent>/<benchmark lower-cased>.json" is checked if it exists;
	    missing files are skipped. A file is accepted when:

	    - it holds a JSON array of objects,
	    - every object has all required keys,
	    - "agent_name" equals *agent* and "benchmark" equals the benchmark,
	    - "score" and "std_err" are numbers and "date_time" is a string in
	      "YYYY-MM-DD HH:MM:SS" form (main() formats and parses these values),
	    - exactly one object has "original_or_reproduced" == "Original".

	    Args:
	        agent: agent directory name.

	    Returns:
	        False for an invalid agent name or path, an unreadable or malformed
	        file, or any violated rule above; True otherwise.
	    """
	    required_keys = (
	        "agent_name", "benchmark", "original_or_reproduced", "score", "std_err",
	        "benchmark_specific", "benchmark_tuned", "followed_evaluation_protocol",
	        "reproducible", "comments", "study_id", "date_time",
	    )
	    try:
	        safe_agent = sanitize_agent_name(agent)
	        for benchmark in BENCHMARKS:
	            file_path = safe_path_join(safe_agent, f"{benchmark.lower()}.json")
	            if not file_path.is_file():
	                continue
	            with open(file_path, encoding="utf-8") as f:
	                results = json.load(f)
	            if not isinstance(results, list):
	                return False

	            original_count = 0
	            for result in results:
	                if not isinstance(result, dict):
	                    return False
	                if not all(key in result for key in required_keys):
	                    return False
	                if result["agent_name"] != agent:
	                    return False
	                if result["benchmark"] != benchmark:
	                    return False
	                for field in ("score", "std_err"):
	                    number = result[field]
	                    # bool is a subclass of int but is not a valid score.
	                    if isinstance(number, bool) or not isinstance(number, (int, float)):
	                        return False
	                if not isinstance(result["date_time"], str):
	                    return False
	                # Raises ValueError (handled below) on a malformed timestamp.
	                datetime.strptime(result["date_time"], "%Y-%m-%d %H:%M:%S")
	                if result["original_or_reproduced"] == "Original":
	                    original_count += 1
	            if original_count != 1:
	                return False
	        return True
	    except (ValueError, OSError):
	        # ValueError covers invalid names/paths, JSON and text decoding errors
	        # and bad timestamps; OSError covers files that cannot be read.
	        return False

	# Columns of the per-benchmark table, in display order. "std_err" and
	# "Reproduced_all" are folded into other cells by create_html_table_benchmark().
	_BENCHMARK_COLUMNS = [
	    "Agent", "Score", "std_err", "Benchmark Specific", "Benchmark Tuned",
	    "Followed Evaluation Protocol", "Reproducible", "Comments", "Study ID",
	    "Date", "Reproduced", "Reproduced_all",
	]

	# Text of the "About" tab. File names follow "<benchmark lower-cased>.json",
	# which is what the loader looks for.
	_ABOUT_MARKDOWN = '''
	# BrowserGym Leaderboard

	This leaderboard tracks performance of various agents on web navigation tasks.

	## How to Submit Results for New Agents

	### 1. Create Results Directory
	Create a new folder in the `results` directory with your agent's name:
	```bash
	results/
	└── your-agent-name/
	    ├── README.md
	    ├── webarena.json
	    ├── workarena-l1.json
	    ├── workarena-l2.json
	    ├── workarena-l3.json
	    ├── miniwob.json
	    ├── weblinx.json
	    └── assistantbench.json
	```


	### 2. Add Agent Details

	Create a `README.md` in your agent's folder with the following details:

	#### Required Information
	- Model Name: Base model used (e.g., GPT-4, Claude-2)
	- Model Architecture: Architecture details and any modifications
	- Input/Output Format: How inputs are processed and outputs generated
	- Training Details: Training configuration if applicable
	- Dataset used
	- Number of training steps
	- Hardware used
	- Training time

	#### Optional Information
	- Paper Link: Link to published paper/preprint if available
	- Code Repository: Link to public code implementation
	- Additional Notes: Any special configurations or requirements
	- License: License information for your agent

	Make sure to organize the information in clear sections using Markdown.

	### 3. Add Benchmark Results

	Create separate JSON files for each benchmark following this format:

	```json
	[
	{
	"agent_name": "your-agent-name",
	"study_id": "unique-study-identifier-from-agentlab",
	"date_time": "YYYY-MM-DD HH:MM:SS",
	"benchmark": "WebArena",
	"score": 0.0,
	"std_err": 0.0,
	"benchmark_specific": "Yes/No",
	"benchmark_tuned": "Yes/No",
	"followed_evaluation_protocol": "Yes/No",
	"reproducible": "Yes/No",
	"comments": "Additional details",
	"original_or_reproduced": "Original"
	}
	]
	```

	Please add all the benchmark files in separate json files named as follows:

	- `webarena.json`
	- `workarena-l1.json`
	- `workarena-l2.json`
	- `workarena-l3.json`
	- `miniwob.json`
	- `weblinx.json`
	- `assistantbench.json`

	Each file must contain a JSON array with a single object following the format above. The benchmark field in each file must match the benchmark name exactly ([`WebArena`, `WorkArena-L1`, `WorkArena-L2`, `WorkArena-L3`, `MiniWoB`, `WebLINX`, `AssistantBench`]) and benchmark_lowercase.json as the filename.

	### 4. Submit PR

	1. Open the community tab and press "New Pull Request"
	2. Give it a new title to the PR and follow the steps mentioned
	3. Publish the branch

	## How to Submit Reproducibility Results for Existing Agents

	Open the results file for the agent and benchmark you reproduced the results for.

	### 1. Add reproduced results


	Append the following entry in the json file. Ensure you set `original_or_reproduced` as `Reproduced`.

	```json
	[
	{
	"agent_name": "your-agent-name",
	"study_id": "unique-study-identifier-from-agentlab",
	"date_time": "YYYY-MM-DD HH:MM:SS",
	"benchmark": "WebArena",
	"score": 0.0,
	"std_err": 0.0,
	"benchmark_specific": "Yes/No",
	"benchmark_tuned": "Yes/No",
	"followed_evaluation_protocol": "Yes/No",
	"reproducible": "Yes/No",
	"comments": "Additional details",
	"original_or_reproduced": "Reproduced"
	}
	]
	```

	### 2. Submit PR

	1. Open the community tab and press "New Pull Request"
	2. Give it a new title to the PR and follow the steps mentioned
	3. Publish the branch

	## License

	MIT
	'''


	def _load_all_results():
	    """Load every agent's result entries from the "results" directory.

	    Returns a dict mapping agent name to the concatenated entries of all its
	    benchmark files. Agents that fail check_sanity() are reported with
	    st.error and left out.
	    """
	    all_results = {}
	    # Sorted for a stable order; plain files next to the agent directories
	    # are not agents and are skipped.
	    for agent in sorted(os.listdir("results")):
	        if not os.path.isdir(os.path.join("results", agent)):
	            continue
	        if not check_sanity(agent):
	            st.error(f"Results for {agent} are not in the correct format.")
	            continue
	        agent_results = []
	        for benchmark in BENCHMARKS:
	            file_path = safe_path_join(agent, f"{benchmark.lower()}.json")
	            if not file_path.is_file():
	                continue
	            with open(file_path) as f:
	                agent_results.extend(json.load(f))
	        all_results[agent] = agent_results
	    return all_results


	def _format_timestamp(raw):
	    """Convert "YYYY-MM-DD HH:MM:SS" into e.g. "January 05, 2025 03:07 PM".

	    A value that cannot be parsed is returned as text rather than raising, so
	    one bad entry does not take the whole page down.
	    """
	    try:
	        return datetime.strptime(raw, "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
	    except (TypeError, ValueError):
	        return str(raw)


	def _leaderboard_rows(results):
	    """Build one row per agent holding its "Original" score for each benchmark ("-" if none)."""
	    rows = []
	    for agent, entries in results.items():
	        row = {"Agent": agent}
	        for benchmark in BENCHMARKS:
	            row[benchmark] = next(
	                (
	                    entry["score"]
	                    for entry in entries
	                    if entry["benchmark"] == benchmark and entry["original_or_reproduced"] == "Original"
	                ),
	                "-",
	            )
	        rows.append(row)
	    return rows


	def _benchmark_rows(results, benchmark):
	    """Build one row per agent for a single benchmark.

	    Fields come from the agent's "Original" entry ("-" when there is none).
	    "Reproduced" is "min - max" over the scores of all "Reproduced" entries
	    (or "-"), and "Reproduced_all" lists each one as "score, date". Entries
	    may appear in any order and are not modified.
	    """
	    rows = []
	    for agent, entries in results.items():
	        row = {"Agent": agent}
	        row.update({column: "-" for column in _BENCHMARK_COLUMNS[1:-2]})
	        reproduced_scores = []
	        reproduced_details = []
	        for entry in entries:
	            if entry["benchmark"] != benchmark:
	                continue
	            if entry["original_or_reproduced"] == "Original":
	                row["Score"] = entry["score"]
	                row["std_err"] = entry["std_err"]
	                row["Benchmark Specific"] = entry["benchmark_specific"]
	                row["Benchmark Tuned"] = entry["benchmark_tuned"]
	                row["Followed Evaluation Protocol"] = entry["followed_evaluation_protocol"]
	                row["Reproducible"] = entry["reproducible"]
	                row["Comments"] = entry["comments"]
	                row["Study ID"] = entry["study_id"]
	                row["Date"] = _format_timestamp(entry["date_time"])
	            elif entry["original_or_reproduced"] == "Reproduced":
	                reproduced_scores.append(entry["score"])
	                reproduced_details.append(", ".join([str(entry["score"]), _format_timestamp(entry["date_time"])]))
	        if reproduced_scores:
	            row["Reproduced"] = str(min(reproduced_scores)) + " - " + str(max(reproduced_scores))
	        else:
	            row["Reproduced"] = "-"
	        row["Reproduced_all"] = reproduced_details
	        rows.append(row)
	    return rows


	def _agent_hyperlink(agent_name):
	    """Return an <a> tag linking to the agent's README, or "" for an invalid name."""
	    try:
	        safe_name = sanitize_agent_name(agent_name)
	    except ValueError:
	        return ""
	    safe_url = f"https://huggingface.co/spaces/ServiceNow/browsergym-leaderboard/blob/main/results/{quote(safe_name)}/README.md"
	    return f'<a href="{html.escape(safe_url)}" target="_blank">{html.escape(safe_name)}</a>'


	def main():
	    """Render the BrowserGym leaderboard page.

	    Sets up the page, loads all agents' results, then fills one tab with the
	    main leaderboard (search box, sortable table, CSV export), one tab per
	    benchmark in BENCHMARKS, and an "About" tab with submission instructions.
	    """
	    st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
	    # --lighter-color is referenced by the "±" span in sanitize_cell_value().
	    st.markdown("""
	<style>
	:root {
	--lighter-color: #888; /* Default for light theme */
	}
	@media (prefers-color-scheme: dark) {
	:root {
	--lighter-color: #ccc; /* Default for dark theme */
	}
	}
	</style>
	""", unsafe_allow_html=True)

	    # NOTE(review): these tags are emitted through st.markdown rather than as
	    # HTTP response headers. Browsers ignore frame-ancestors and
	    # X-Frame-Options when delivered via <meta>, so this likely gives no real
	    # framing protection - confirm against the hosting configuration.
	    st.markdown("""
	<head>
	<meta http-equiv="Content-Security-Policy"
	content="default-src 'self' https://huggingface.co;
	script-src 'self' 'unsafe-inline';
	style-src 'self' 'unsafe-inline';
	img-src 'self' data: https:;
	frame-ancestors 'none';">
	<meta http-equiv="X-Frame-Options" content="DENY">
	<meta http-equiv="X-Content-Type-Options" content="nosniff">
	<meta http-equiv="Referrer-Policy" content="strict-origin-when-cross-origin">
	</head>
	""", unsafe_allow_html=True)

	    all_results = _load_all_results()

	    st.title("🏆 BrowserGym Leaderboard")
	    st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
	    tabs = st.tabs(["🏆 Main Leaderboard",] + BENCHMARKS + ["📝 About"])

	    with tabs[0]:
	        # Explicit columns keep the frame usable even when no agent loaded.
	        df = pd.DataFrame(_leaderboard_rows(all_results), columns=["Agent"] + BENCHMARKS)
	        for benchmark in BENCHMARKS:
	            df[benchmark] = df[benchmark].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
	            df[benchmark] = df[benchmark].astype(str)

	        search_query = st.text_input("Search agents", "", key="search_main")
	        if search_query:
	            # regex=False: the query is user-typed text, and input such as
	            # "(" or "c++" is not a valid regular expression.
	            df = df[df['Agent'].str.contains(search_query, case=False, regex=False)]

	        # Only the displayed copy gets hyperlinks, so the CSV export below
	        # keeps plain agent names.
	        display_df = df.copy()
	        display_df['Agent'] = display_df['Agent'].apply(_agent_hyperlink)
	        html_table = create_html_table_main(display_df)
	        st.markdown(html_table, unsafe_allow_html=True)

	        if st.button("Export to CSV", key="export_main"):
	            csv_data = df.to_csv(index=False)
	            st.download_button(
	                label="Download CSV",
	                data=csv_data,
	                file_name="leaderboard.csv",
	                key="download-csv",
	                help="Click to download the CSV file",
	            )

	    with tabs[-1]:
	        st.markdown(_ABOUT_MARKDOWN)

	    for i, benchmark in enumerate(BENCHMARKS, start=1):
	        with tabs[i]:
	            df_ = pd.DataFrame(_benchmark_rows(all_results, benchmark), columns=_BENCHMARK_COLUMNS)
	            df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
	            df_['std_err'] = df_['std_err'].apply(lambda x: f"{x:.1f}" if x != "-" else "-")
	            df_['Score'] = df_['Score'].astype(str)
	            html_table = create_html_table_benchmark(df_, benchmark)
	            st.markdown(html_table, unsafe_allow_html=True)


	# Script entry point: build the page when this file is run as the main module.
	if __name__ == "__main__":
	    main()