# app.py: Streamlit demo for L2CEval
import json
import math
import random
import streamlit as st
import pandas as pd
import time
import numpy as np
# define some constants
CODE_LLM = "Codex"
DEFAULT_FIRST_EXAMPLE_IDX = 47
MAX_STAGE = 5
DEFAULT_TOP_K_EXAMPLES = 10
DATASET_NAMES = ["Spider", "WikiTQ", "GSM8k", "MBPP"]
RESULT_FILES_DICTS = {
"Spider": "demo-spider-codex-results.jsonl",
"WikiTQ": "demo-wikitq-codex-results.jsonl",
"GSM8k": "demo-gsm8k-codex-results.jsonl",
"MBPP": "demo-mbpp-codex-results.jsonl"
}
N_MODELS = 54
N_ORGS = 13
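# A minimal sketch of how the per-dataset result files above could be loaded
# for the upcoming model output explorer. The JSONL record layout is an
# assumption; the actual schema of the demo-*.jsonl files may differ. Once the
# explorer is wired up, this could be wrapped in @st.cache_data.
def load_results(dataset_name: str) -> list:
    """Load the cached Codex outputs for one of the DATASET_NAMES."""
    results = []
    with open(RESULT_FILES_DICTS[dataset_name]) as f:
        for line in f:
            if line.strip():
                results.append(json.loads(line))
    return results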
#################### Setups must go first ####################
st.set_page_config(layout="wide")
#################### Side Bar ####################
with st.sidebar:
st.markdown("# About")
st.markdown("**L2CEval** is a framework for evaluating Language-to-Code generation for LLMs.")
# st.info("**Site under construction**")
st.warning("**Interactive visualizer (coming soon!)**")
st.warning("**Model output explorer (coming soon!)**")
#################### START OF DEMO ####################
# some basic intro
st.image("images/l2ceval-logo.png", use_column_width="auto")
st.markdown("### L2CEval: Evaluating Language-to-Code Generation Capabilities of Large Language Models")
st.markdown("**Ansong Ni$^†$, Pengcheng Yin$^♣$, Yilun Zhao$^†$, Martin Riddell$^†$, Troy Feng$^†$, Rui Shen$^†$, Stephen Yin$^†$**")
st.markdown("**Ye Liu$^β™’$, Semih Yavuz$^β™’$, " \
"Caiming Xiong$^β™’$, Shafiq Joty$^β™’$, Yingbo Zhou$^β™’$, Dragomir Radev$^†$, Arman Cohan$^†‑$**")
st.markdown("†: Yale University, ♣: Google DeepMind, β™’: Salesforce Research, ‑: Allen Institute for AI")
st.warning(":orange[**Site under construction πŸ› οΈ... Stay tuned!**]")
st.divider()
# st.markdown("#### Abstract")
# st.markdown("""
# Recently, large language models (LLMs), especially those that are pretrained
# on code, have demonstrated strong capabilities in generating programs from
# natural language inputs in a few-shot or even zero-shot manner. Despite
# promising results, there is a notable lack of a comprehensive evaluation of
# these models' language-to-code generation capabilities. Existing studies often
# focus on specific tasks, model architectures, or learning paradigms, leading to
# a fragmented understanding of the overall landscape. In this work, we present
# L2CEval, a systematic evaluation of the language-to-code generation
# capabilities of LLMs on 7 tasks across the domain spectrum of semantic parsing,
# math reasoning and Python programming, analyzing the factors that potentially
# affect their performance, such as model size, pretraining data, instruction
# tuning, and different prompting methods. In addition to assessing model
# performance, we measure confidence calibration for the models and conduct human
# evaluations of the output programs. This enables us to identify and analyze the
# typical failure modes across various tasks and models. L2CEval offers a
# comprehensive understanding of the capabilities and limitations of LLMs in
# language-to-code generation. We also release the evaluation framework and all
# model outputs, hoping to lay the groundwork for further future research in this
# domain.
# """)
st.markdown("#### Language-to-Code (L2C) Generation")
st.markdown("Langauge-to-Code (L2C) generation is a type of tasks that maps from natural language to code. It is " \
"the cornerstone of many applications in AI, such as 1) chatbots; 2) coding assistants; " \
"3) language interfaces for databases; 4) robotic control; etc")
st.image("images/pipeline.png", caption="Example of L2C tasks", use_column_width="auto")
st.divider()
st.markdown("#### L2CEval - Tasks")
st.markdown("We evaluate the L2C capabilities of LLMs on 7 tasks across the domain spectrum of *semantic parsing*, \
*math reasoning* and *Python programming*:")
st.markdown("""
| Domain | Dataset | Split | Size | Input | Output |
|------------------|--------------------------------------------------------------|-------|--------|---------------------------------|------------------------|
| Semantic Parsing | [Spider (Yu et al., 2018)](https://yale-lily.github.io/spider)| Dev | 1,000 | DB schema + NL | SQL Query |
| | [WikiTQ (Pasupat and Liang, 2015)](https://ppasupat.github.io/WikiTableQuestions) | Dev | 2,828 | Table headers + NL | SQL Query |
| Math Reasoning | [GSM8k (Cobbe et al., 2021)](https://github.com/openai/grade-school-math) | All | 1,494 | Math problem in NL | Python solution |
| | [SVAMP (Patel et al., 2021)](https://github.com/arkilpatel/SVAMP) | All | 996 | Math problem in NL | Python solution |
| Python Programming| [MBPP (Austin et al., 2021)](https://github.com/google-research/google-research/blob/master/mbpp/README.md) | Test | 500 | NL spec. + 1 test | Python function |
| | [HumanEval (Chen et al., 2021)](https://github.com/openai/human-eval) | All | 164 | NL spec. + 1-3 test | Python function |
| | [DS-1000 (Lai et al., 2022)](https://github.com/HKUNLP/DS-1000) | All | 1000 | NL spec. | Python lines |
""")
st.divider()
st.markdown("#### L2CEval - Models")
st.markdown(f"We evaluate {N_MODELS} models from {N_ORGS} organizations. Here is a summary of the *open-source* models we evaluated:")
st.markdown("""
| Organization | Model Name | Release Time | Sizes | # All Tokens | # Code Tokens | Ctx. Leng. | Code Specific | Inst. Tuned |
|-------------------|--------------------------|--------------|----------------|--------------|---------------|------------|---------------|-------------|
| Salesforce | CodeGen-multi | 2022-3 | 6.1/16.1B | 505B | 119B | 2,048 | βœ“ | βœ— |
| Salesforce | CodeGen-mono | 2022-3 | 6.1/16.1B | 577B | 191B | 2,048 | βœ“ | βœ— |
| Salesforce | CodeGen-2.5-multi | 2023-7 | 7B | 1.4T | 1.4T | 2,048 | βœ“ | βœ— |
| Salesforce | CodeGen-2.5-mono | 2023-7 | 7B | - | - | 2,048 | βœ“ | βœ— |
| Salesforce | CodeGen-2.5-instruct | 2023-7 | 7B | - | - | 2,048 | βœ“ | βœ“ |
| Eleuther AI | GPT-J | 2021-5 | 6.1B | 402B | 46B | 2,048 | βœ— | βœ— |
| Eleuther AI | GPT-NeoX | 2022-4 | 20.6B | 472B | 54B | 2,048 | βœ— | βœ— |
| Eleuther AI | Pythia | 2023-4 | 1.4/6.9/12B | 300B | 35B | 2,048 | βœ— | βœ— |
| Databricks | Dolly-v2 | 2023-4 | 6.9/12B | - | - | 2,048 | βœ— | βœ“ |
| BigCode | SantaCoder | 2023-1 | 1.1B | 236B | 236B | 2,048 | βœ“ | βœ— |
| BigCode | StarCoder | 2023-5 | 15.5B | 1T | 1T | 8,192 | βœ“ | βœ— |
| BigCode | StarCoderPlus | 2023-6 | 15.5B | 1.6T | 1T | 8,192 | βœ“ | βœ— |
| Meta AI | InCoder | 2022-4 | 1.3/6.7B | 52B | 52B | 2,048 | βœ“ | βœ— |
| Meta AI | LLaMA | 2023-2 | 6.7/13B | 1T | 45B | 2,048 | βœ— | βœ— |
| Meta AI | LLaMA-30B | 2023-2 | 32.5B | 1.4T | 63B | 2,048 | βœ— | βœ— |
| Meta AI | LLaMA-2 | 2023-7 | 7/13/70B | 2T | - | 4,096 | βœ— | βœ— |
| Meta AI | CodeLLaMA | 2023-7 | 7/13/34B | 2.5T | 435B | 16,384 | βœ“ | βœ— |
| Stanford | Alpaca | 2023-3 | 6.7/13/32.5B | - | - | 2,048 | βœ— | βœ“ |
| LMSYS | Vincuna | 2023-3 | 6.7/13/32.5B | - | - | 2,048 | βœ— | βœ— |
| Replit | Replit-code-v1-3b | 2023-5 | 2.7B | 525B | 525B | 2,048 | βœ“ | βœ— |
| MosaicML | MPT-7B | 2023-5 | 7B | 1T | 135B | 2,048 | βœ— | βœ— |
| MosaicML | MPT-7B-instruct | 2023-5 | 7B | - | - | 2,048 | βœ— | βœ“ |
| MosaicML | MPT-30B | 2023-6 | 30B | 1T | 135B | 8,192 | βœ— | βœ— |
| MosaicML | MPT-30B-instruct | 2023-6 | 30B | - | - | 8,192 | βœ— | βœ“ |
""")
st.markdown("\n\n\n\n")
st.markdown("In addition, we also evaluated the following *proprietary* models:")
st.markdown("""
- OpenAI GPT-4
- OpenAI GPT-3.5-turbo
- OpenAI text-davinci-002
- OpenAI text-davinci-003
- OpenAI code-davinci-002
- OpenAI code-cushman-001
""")
st.divider()
# Interactive scatter plot of model size vs. average performance, backed by
# the results csv (coming soon):
# results = pd.read_csv("data/scatter.csv")
# st.scatter_chart(
#     results,
#     x="Model Size",
#     y="Avg. Perf.",
#     color="Model Series",
# )
st.markdown("#### All Results (coming soon!)")
# st.image("images/all_results.png", use_column_width="auto")