qiantong-xu's picture
add codellama results (#3)
d38ed8c
raw
history blame
10.3 kB
__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
import gradio as gr
import pandas as pd
COLUMN_NAMES = ["Model", "Tuned on ToolBench", "Avg.", "Open Weather", "The Cat API", "Home Search", "Trip Booking", "Google Sheets", "VirtualHome", "WebShop Long", "WebShop Short", "Tabletop"]
UNTUNED_MODEL_RESULTS = '''[gpt4](https://platform.openai.com/docs/models/gpt-4) & 93.0 & 96.0 & 97.0 & 96.7 & 62.9 & 23.0 / 23.5 & 0.0 & 0.0 & 81.0 \\
[text-davinci-003](https://platform.openai.com/docs/models/gpt-3) & 99.0 & 98.0 & 97.0 & 89.2 & 62.9 & 31.0 / 25.1 & 0.0 & 0.0 & 66.7 \\
[gpt-3.5-turbo](https://platform.openai.com/docs/models/gpt-3-5) & 90.0 & 92.0 & 80.0 & 85.8 & 51.4 & 20.0 / 18.9 & 0.0 & 1.8 & 33.3 \\
[text-curie-001](https://platform.openai.com/docs/models/gpt-3) & 8.0 & 58.0 & 6.0 & 6.7 & 1.4 & 12.0 / 4.1 & 0.0 & 0.0 & 1.0 \\
[Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) & 90.0 & 84.39 & 83.0 & 71.67 & 58.57 & 35.0 / 24.74 & 1.53 & 30.45 & 45.4 \\
[Llama-2-13b](https://huggingface.co/meta-llama/Llama-2-13b) & 85.0 & 77.0 & 68.0 & 53.33 & 30.0 & 33.0 / 21.67 & 0.6 & 31.67 & 23.81 \\
[Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) & 76.0 & 83.0 & 58.0 & 33.33 & 22.86 & 25.0 / 21.49 & 0.0 & 6.92 & 14.39
[llama-65b](https://huggingface.co/huggyllama/llama-65b) & 90.0 & 80.0 & 84.0 & 65.8 & 32.9 & 32.0 / 20.3 & 0.0 & 41.2 & 30.5 \\
[llama-30b](https://huggingface.co/huggyllama/llama-30b) & 78.0 & 84.0 & 66.0 & 45.0 & 37.1 & 27.0 / 21.7 & 0.0 & 30.6 & 34.3 \\
[llama-13b](https://huggingface.co/huggyllama/llama-13b) & 70.0 & 74.0 & 45.0 & 35.8 & 5.7 & 28.0 / 18.9 & 0.0 & 27.6 & 17.1 \\
[llama-13b-alpaca](https://huggingface.co/chavinlo/gpt4-x-alpaca) & 62.0 & 43.0 & 44.0 & 40.8 & 11.4 & 1.0 / 1.6 & 0.0 & 2.7 & 9.5 \\
[CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) & 86.0 & 92.0 & 74.0 & 63.33 & 38.08 & 35.0 / 21.97 & 0.0 & 0.0 & 11.16 \\
[CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) & 90.0 & 94.0 & 78.0 & 61.67 & 41.27 & 32.0 / 21.95 & 0.0 & 0.0 & 16.98 \\
[CodeLlama-7b-Python-hf](https://huggingface.co/codellama/CodeLlama-7b-Python-hf) & 83.0 & 88.0 & 83.0 & 68.33 & 49.13 & 31.0 / 21.33 & 0.0 & 1.58 & 22.86 \\
[CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) & 96.0 & 86.83 & 87.0 & 70.83 & 51.26 & 35.0 / 22.28 & 0.0 & 0.0 & 40.95 \\
[CodeLlama-13b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf) & 98.0 & 89.58 & 85.0 & 72.5 & 48.97 & 31.0 / 22.56 & 0.0 & 9.7 & 57.62 \\
[CodeLlama-13b-Python-hf](https://huggingface.co/codellama/CodeLlama-13b-Python-hf) & 93.0 & 92.35 & 86.0 & 62.5 & 50.79 & 37.0 / 21.94 & 0.0 & 2.4 & 41.53 \\
[CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) & 96.39 & 85.0 & 88.0 & 88.33 & 64.29 & 34.0 / 24.65 & 0.0 & 5.53 & 51.32 \\
[CodeLlama-34b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf) & 94.28 & 86.0 & 90.0 & 89.17 & 61.11 & 31.0 / 24.34 & 0.0 & 25.99 & 47.46 \\
[CodeLlama-34b-Python-hf](https://huggingface.co/codellama/CodeLlama-34b-Python-hf) & 91.11 & 88.42 & 91.0 & 85.83 & 55.87 & 28.0 / 21.24 & 0.0 & 6.47 & 33.33 \\
[starcoder](https://huggingface.co/bigcode/starcoder) & 91.0 & 84.0 & 82.0 & 51.7 & 48.0 & 23.0 / 19.4 & 2.6 & 0.0 & 21.9 \\
[starcoderbase](https://huggingface.co/bigcode/starcoderbase) & 90.0 & 86.0 & 79.0 & 63.3 & 42.9 & 24.0 / 16.3 & 5.8 & 23.1 & 17.1 \\
[codegen-16B-nl](https://huggingface.co/Salesforce/codegen-16B-nl) & 51.0 & 75.0 & 37.0 & 21.7 & 7.1 & 43.0 / 18.0 & 0.0 & 0.0 & 16.2 \\
[codegen-16B-multi](https://huggingface.co/Salesforce/codegen-16B-multi) & 56.0 & 75.0 & 47.0 & 7.5 & 21.4 & 31.0 / 14.1 & 0.0 & 0.5 & 8.6 \\
[codegen-16B-mono](https://huggingface.co/Salesforce/codegen-16B-mono) & 63.7 & 72.0 & 52.0 & 28.3 & 31.5 & 28.0 / 15.7 & 1.5 & 6.6 & 15.2 \\
[bloomz](https://huggingface.co/bigscience/bloomz) & 58.0 & 85.0 & 36.0 & 22.5 & 14.3 & 9.0 / 4.9 & 0.0 & 1.0 & 1.0 \\
[opt-iml-30b](https://huggingface.co/facebook/opt-iml-30b) & 44.0 & 48.0 & 5.0 & 3.3 & 2.9 & 13.0 / 8.3 & 0.0 & 0.0 & 1.0 \\
[opt-30b](https://huggingface.co/facebook/opt-30b) & 46.0 & 35.0 & 2.0 & 3.3 & 8.6 & 24.0 / 11.7 & 0.0 & 0.0 & 1.0 \\
[opt-iml-1.3b](https://huggingface.co/facebook/opt-iml-1.3b) & 20.0 & 28.0 & 0.0 & 0.0 & 4.3 & 13.0 / 3.1 & 0.0 & 0.0 & 1.0 \\
[opt-1.3b](https://huggingface.co/facebook/opt-1.3b) & 18.0 & 30.0 & 0.0 & 0.0 & 1.4 & 31.0 / 9.7 & 0.0 & 0.0 & 1.0 \\
[neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) & 55.0 & 69.0 & 27.0 & 10.8 & 18.6 & 28.0 / 15.3 & 0.0 & 8.8 & 6.7 \\
[GPT-NeoXT-Chat-Base-20B](https://huggingface.co/togethercomputer/GPT-NeoXT-Chat-Base-20B) & 43.0 & 73.0 & 28.0 & 10.8 & 4.3 & 26.0 / 13.1 & 0.0 & 0.7 & 7.6 \\
[pythia-12b](https://huggingface.co/EleutherAI/pythia-12b) & 53.0 & 65.0 & 12.0 & 0.8 & 11.4 & 17.0 / 12.1 & 0.0 & 0.0 & 1.9 \\
[dolly-v2-12b]() & 0.0 & 1.0 & 10.0 & 5.0 & 7.1 & 11.0 / 8.9 & 0.0 & 0.0 & 7.6 \\
[pythia-6.9b](https://huggingface.co/EleutherAI/pythia-6.9b) & 41.0 & 72.0 & 8.0 & 7.5 & 4.3 & 29.0 / 14.0 & 0.0 & 0.0 & 8.6 \\
[pythia-2.8b](https://huggingface.co/EleutherAI/pythia-2.8b) & 49.0 & 54.0 & 7.0 & 3.3 & 12.9 & 24.0 / 14.8 & 0.0 & 0.0 & 7.6 \\
[pythia-1.4b](https://huggingface.co/EleutherAI/pythia-1.4b) & 37.0 & 48.0 & 4.0 & 5.0 & 10.0 & 22.0 / 10.7 & 0.0 & 5.2 & 7.6 \\
[stablelm-base-alpha-7b](https://huggingface.co/stabilityai/stablelm-base-alpha-7b) & 22.0 & 47.0 & 0.0 & 0.0 & 4.3 & 28.0 / 10.3 & 0.0 & 0.0 & 2.9 \\
[stablelm-tuned-alpha-7b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b) & 23.0 & 38.0 & 0.0 & 0.0 & 1.4 & 26.0 / 7.3 & 0.0 & 0.0 & 3.8 \\
[stablelm-base-alpha-3b](https://huggingface.co/stabilityai/stablelm-base-alpha-3b) & 6.0 & 28.0 & 0.0 & 0.0 & 1.4 & 29.0 / 5.3 & 0.0 & 0.0 & 1.0 \\
[stablelm-tuned-alpha-3b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b) & 14.0 & 31.0 & 0.0 & 0.8 & 0.0 & 8.0 / 5.6 & 0.0 & 0.0 & 1.0 \\'''
TUNED_MODEL_RESULTS='''[llama-30b-toolbench](https://huggingface.co/sambanovasystems/LLaMA-30b-toolbench) & 100.0 & 94.0 & 87.0 & 85.8 & 2.9 & 16.0/ 24.3& 0.0 & 0.0 & 7.5 \\
[starcoder-toolbench](https://huggingface.co/sambanovasystems/starcoder-toolbench) & 99.0 & 97.0 & 83.0 & 80.8 & 21.2 & 31.0/ 18.4& 0.0 & 0.0 & 13.9 \\
[codegen-16B-mono-toolbench](https://huggingface.co/sambanovasystems/codegen-16B-mono-toolbench) & 97.7 & 99.0 & 82.0 & 77.5 & 19.8 & 29.0/ 17.2& 0.0 & 3.5 & 16.2 \\'''
def parse_line(line):
model_results = line.replace(" ", "").strip("\\").split("&")
for i in range(1, len(model_results)):
if i == 6:
res = model_results[6].split('/')[-1].strip()
else:
res = model_results[i]
model_results[i] = float(res)
return model_results
def get_baseline_df():
df_data = []
lines = UNTUNED_MODEL_RESULTS.split("\n")
for line in lines:
model_results = parse_line(line)
assert len(model_results) == 10
avg = sum(model_results[1:-3] + model_results[-2:]) / 8
model_results.insert(1, avg)
model_results.insert(1, "False")
df_data.append(model_results)
lines = TUNED_MODEL_RESULTS.split("\n")
for line in lines:
model_results = parse_line(line)
assert len(model_results) == 10
avg = sum(model_results[1:-3] + model_results[-2:]) / 8
model_results.insert(1, avg)
model_results.insert(1, "True")
df_data.append(model_results)
print(len(df_data))
df = pd.DataFrame(df_data, columns=COLUMN_NAMES).round(1)
return df
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{xu2023tool,
title={On the Tool Manipulation Capability of Open-source Large Language Models},
author={Qiantong Xu and Fenglu Hong and Bo Li and Changran Hu and Zhengyu Chen and Jian Zhang},
year={2023},
eprint={2305.16504},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
}"""
block = gr.Blocks()
with block:
gr.Markdown(
"""# Toolbench Leaderboard
Welcome to the leaderboard of the ToolBench! 🏆
This is a community where participants create language models and action generation algorithms to generate API function calls based goals described in natural lanugage!
Please refer to [our paper](https://arxiv.org/abs/2305.16504) for more details and join our [Discord](https://discord.com/invite/JehFG5HXKb) for further discussion.
The [evaluation suite](https://github.com/sambanova/toolbench/) is now alive on Github.
"""
)
with gr.Row():
with gr.Accordion("Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
elem_id="citation-button",
).style(show_copy_button=True)
gr.Markdown(
"""In the table below, we summarize the 3-shot performance of all the models.
We use success rate as the primary evaluation metric for most tasks, except that we report rewards on WebShop, and the Longest Common Subsequence (LCS) on VirtualHome, following the original metrics proposed by the respective authors.
"""
)
with gr.Row():
data = gr.components.Dataframe(
type="pandas", datatype=["markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
)
with gr.Row():
data_run = gr.Button("Refresh")
data_run.click(
get_baseline_df, outputs=data
)
block.load(get_baseline_df, outputs=data)
block.launch()