import json
import math
import random
import time

import numpy as np
import pandas as pd
import streamlit as st

# define some constants
CODE_LLM = "Codex"
DEFAULT_FIRST_EXAMPLE_IDX = 47
MAX_STAGE = 5
DEFAULT_TOP_K_EXAMPLES = 10
DATASET_NAMES = ["Spider", "WikiTQ", "GSM8k", "MBPP"]
RESULT_FILES_DICTS = {
    "Spider": "demo-spider-codex-results.jsonl",
    "WikiTQ": "demo-wikitq-codex-results.jsonl",
    "GSM8k": "demo-gsm8k-codex-results.jsonl",
    "MBPP": "demo-mbpp-codex-results.jsonl",
}
N_MODELS = 54
N_ORGS = 13
#################### Setups must go first ####################
st.set_page_config(layout="wide")
#################### Side Bar ####################
with st.sidebar:
    st.markdown("# About")
    st.markdown("**L2CEval** is a framework for evaluating the language-to-code generation capabilities of LLMs.")
    # st.info("**Site under construction**")
    st.warning("**Interactive visualizer (coming soon!)**")
    st.warning("**Model output explorer (coming soon!)**")
    # with st.expander(":blue[**Authors**]", expanded=False):
    #     st.markdown("**Ansong Ni♠, Pengcheng Yin♣, Yilun Zhao♠, Martin Riddell♠, Troy Feng♠, Rui Shen♠, Stephen Yin♠, Ye Liu♢, Semih Yavuz♢, " \
    #         "Caiming Xiong♢, Shafiq Joty♢, Yingbo Zhou♢, Dragomir Radev♠, Arman Cohan♠♡**")
    #     st.markdown("**♠: Yale University, ♣: Google DeepMind, ♢: Salesforce Research, ♡: Allen Institute for AI**")
    # st.markdown("**Authors**: Ansong Ni, Srini Iyer, Dragomir Radev, Ves Stoyanov, Wen-tau Yih, Sida I. Wang*, Xi Victoria Lin*")
    # st.markdown("**Demo made by**: [Ansong Ni](https://niansong1996.github.io/)")
    # st.markdown("**All experiment code on [GitHub](https://github.com/niansong1996/lever)**")
#################### START OF DEMO ####################
# some basic intro
st.image("images/l2ceval-logo.png", use_column_width="auto")
st.markdown("### L2CEval: Evaluating Language-to-Code Generation Capabilities of Large Language Models")
st.markdown("**Ansong Ni♠, Pengcheng Yin♣, Yilun Zhao♠, Martin Riddell♠, Troy Feng♠, Rui Shen♠, Stephen Yin♠**")
st.markdown("**Ye Liu♢, Semih Yavuz♢, " \
    "Caiming Xiong♢, Shafiq Joty♢, Yingbo Zhou♢, Dragomir Radev♠, Arman Cohan♠♡**")
st.markdown("♠: Yale University, ♣: Google DeepMind, ♢: Salesforce Research, ♡: Allen Institute for AI")
st.warning(":orange[**Site under construction 🛠️... Stay tuned!**]")
st.divider()
# st.markdown("#### Abstract") | |
# st.markdown(""" | |
# Recently, large language models (LLMs), especially those that are pretrained | |
# on code, have demonstrated strong capabilities in generating programs from | |
# natural language inputs in a few-shot or even zero-shot manner. Despite | |
# promising results, there is a notable lack of a comprehensive evaluation of | |
# these models language-to-code generation capabilities. Existing studies often | |
# focus on specific tasks, model architectures, or learning paradigms, leading to | |
# a fragmented understanding of the overall landscape. In this work, we present | |
# L2CEval, a systematic evaluation of the language-to-code generation | |
# capabilities of LLMs on 7 tasks across the domain spectrum of semantic parsing, | |
# math reasoning and Python programming, analyzing the factors that potentially | |
# affect their performance, such as model size, pretraining data, instruction | |
# tuning, and different prompting methods. In addition to assessing model | |
# performance, we measure confidence calibration for the models and conduct human | |
# evaluations of the output programs. This enables us to identify and analyze the | |
# typical failure modes across various tasks and models. L2CEval offers a | |
# comprehensive understanding of the capabilities and limitations of LLMs in | |
# language-to-code generation. We also release the evaluation framework and all | |
# model outputs, hoping to lay the groundwork for further future research in this | |
# domain. | |
# """) | |
st.markdown("#### Language-to-Code (L2C) Generation") | |
st.markdown("Langauge-to-Code (L2C) generation is a type of tasks that maps from natural language to code. It is " \ | |
"the cornerstone of many applications in AI, such as 1) chatbots; 2) coding assistants; " \ | |
"3) language interfaces for databases; 4) robotic control; etc") | |
st.image("images/pipeline.png", caption="Example of L2C tasks", use_column_width="auto") | |
st.divider()
st.markdown("#### L2CEval - Tasks")
st.markdown("We evaluate the L2C capabilities of LLMs on 7 tasks across the domain spectrum of *semantic parsing*, \
    *math reasoning* and *Python programming*:")
st.markdown(""" | |
| Domain | Dataset | Split | Size | Input | Output | | |
|------------------|--------------------------------------------------------------|-------|--------|---------------------------------|------------------------| | |
| Semantic Parsing | [Spider (Yu et al., 2018)](https://yale-lily.github.io/spider)| Dev | 1,000 | DB schema + NL | SQL Query | | |
| | [WikiTQ (Pasupat and Liang, 2015)](https://ppasupat.github.io/WikiTableQuestions) | Dev | 2,828 | Table headers + NL | SQL Query | | |
| Math Reasoning | [GSM8k (Cobbe et al., 2021)](https://github.com/openai/grade-school-math) | All | 1,494 | Math problem in NL | Python solution | | |
| | [SVAMP (Patel et al., 2021)](https://github.com/arkilpatel/SVAMP) | All | 996 | Math problem in NL | Python solution | | |
| Python Programming| [MBPP (Austin et al., 2021)](https://github.com/google-research/google-research/blob/master/mbpp/README.md) | Test | 500 | NL spec. + 1 test | Python function | | |
| | [HumanEval (Chen et al., 2021)](https://github.com/openai/human-eval) | All | 164 | NL spec. + 1-3 test | Python function | | |
| | [DS-1000 (Lai et al., 2022)](https://github.com/HKUNLP/DS-1000) | All | 1000 | NL spec. | Python lines | | |
""") | |
st.divider()
st.markdown("#### L2CEval - Models")
st.markdown(f"We evaluate {N_MODELS} models from {N_ORGS} organizations. Here is a summary of the *open-source* models we evaluate:")
st.markdown(""" | |
| Organization | Model Name | Release Time | Sizes | # All Tokens | # Code Tokens | Ctx. Leng. | Code Specific | Inst. Tuned | | |
|-------------------|--------------------------|--------------|----------------|--------------|---------------|------------|---------------|-------------| | |
| Salesforce | CodeGen-multi | 2022-3 | 6.1/16.1B | 505B | 119B | 2,048 | β | β | | |
| Salesforce | CodeGen-mono | 2022-3 | 6.1/16.1B | 577B | 191B | 2,048 | β | β | | |
| Salesforce | CodeGen-2.5-multi | 2023-7 | 7B | 1.4T | 1.4T | 2,048 | β | β | | |
| Salesforce | CodeGen-2.5-mono | 2023-7 | 7B | - | - | 2,048 | β | β | | |
| Salesforce | CodeGen-2.5-instruct | 2023-7 | 7B | - | - | 2,048 | β | β | | |
| Eleuther AI | GPT-J | 2021-5 | 6.1B | 402B | 46B | 2,048 | β | β | | |
| Eleuther AI | GPT-NeoX | 2022-4 | 20.6B | 472B | 54B | 2,048 | β | β | | |
| Eleuther AI | Pythia | 2023-4 | 1.4/6.9/12B | 300B | 35B | 2,048 | β | β | | |
| Databricks | Dolly-v2 | 2023-4 | 6.9/12B | - | - | 2,048 | β | β | | |
| BigCode | SantaCoder | 2023-1 | 1.1B | 236B | 236B | 2,048 | β | β | | |
| BigCode | StarCoder | 2023-5 | 15.5B | 1T | 1T | 8,192 | β | β | | |
| BigCode | StarCoderPlus | 2023-6 | 15.5B | 1.6T | 1T | 8,192 | β | β | | |
| Meta AI | InCoder | 2022-4 | 1.3/6.7B | 52B | 52B | 2,048 | β | β | | |
| Meta AI | LLaMA | 2023-2 | 6.7/13B | 1T | 45B | 2,048 | β | β | | |
| Meta AI | LLaMA-30B | 2023-2 | 32.5B | 1.4T | 63B | 2,048 | β | β | | |
| Meta AI | LLaMA-2 | 2023-7 | 7/13/70B | 2T | - | 4,096 | β | β | | |
| Meta AI | CodeLLaMA | 2023-7 | 7/13/34B | 2.5T | 435B | 16,384 | β | β | | |
| Stanford | Alpaca | 2023-3 | 6.7/13/32.5B | - | - | 2,048 | β | β | | |
| LMSYS | Vincuna | 2023-3 | 6.7/13/32.5B | - | - | 2,048 | β | β | | |
| Replit | Replit-code-v1-3b | 2023-5 | 2.7B | 525B | 525B | 2,048 | β | β | | |
| MosaicML | MPT-7B | 2023-5 | 7B | 1T | 135B | 2,048 | β | β | | |
| MosaicML | MPT-7B-instruct | 2023-5 | 7B | - | - | 2,048 | β | β | | |
| MosaicML | MPT-30B | 2023-6 | 30B | 1T | 135B | 8,192 | β | β | | |
| MosaicML | MPT-30B-instruct | 2023-6 | 30B | - | - | 8,192 | β | β | | |
""") | |
st.markdown("\n\n\n\n") | |
st.markdown("In addition, we also evaluated the following *proprietary* models:") | |
st.markdown(""" | |
- OpenAI GPT-4 | |
- OpenAI GPT-3.5-turbo | |
- OpenAI text-davinci-002 | |
- OpenAI text-davinci-003 | |
- OpenAI code-davinci-002 | |
- OpenAI code-cushman-001 | |
""") | |
st.divider() | |
# read results from csv
# results = pd.read_csv("data/scatter.csv")
# st.info(results.to_markdown())
# st.info(results.columns)
# st.info(results.dtypes)
# st.scatter_chart(
#     results,
#     x="Model Size",
#     y="Avg. Perf.",
#     color="Model Series",
#     # size="Avg. Perf.",
# )
# chart_data = pd.DataFrame(np.random.randn(20, 3), columns=["col1", "col2", "col3"])
# chart_data["col4"] = np.random.choice(["A", "B", "C"], 20)
# st.info(chart_data.to_markdown())
# st.info(chart_data.dtypes)
# st.scatter_chart(
#     chart_data,
#     x="col1",
#     y="col2",
#     color="col4",
#     size="col3",
# )
st.markdown("#### All Results (coming soon!)") | |
# st.image("images/all_results.png", use_column_width="auto") |
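
# A guarded, minimal sketch of the model-size vs. average-performance scatter
# plot prototyped in the comments above. It assumes "data/scatter.csv" exists
# with the "Model Size", "Avg. Perf." and "Model Series" columns referenced
# there; until that file ships, the page falls back to a notice.
try:
    results = pd.read_csv("data/scatter.csv")
    st.scatter_chart(
        results,
        x="Model Size",      # x-axis: number of parameters
        y="Avg. Perf.",      # y-axis: average performance across tasks
        color="Model Series",
    )
except FileNotFoundError:
    st.info("Aggregated results are not available yet.")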