niansong1996 committed · Commit 900ec53 · 1 Parent(s): 485a5d0
first commit

Browse files
- .gitattributes +1 -0
- app.py +146 -0
- images/l2ceval-logo.png +3 -0
- images/pipeline.png +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
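(For context: the added line is what `git lfs track "*.png"` appends to .gitattributes, so the PNG images added in this commit are stored via Git LFS.)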
app.py
ADDED
@@ -0,0 +1,146 @@
import json
import math
import random

import streamlit as st
import pandas as pd
import time

# define some constants
CODE_LLM = "Codex"
DEFAULT_FIRST_EXAMPLE_IDX = 47
MAX_STAGE = 5
DEFAULT_TOP_K_EXAMPLES = 10
DATASET_NAMES = ["Spider", "WikiTQ", "GSM8k", "MBPP"]
RESULT_FILES_DICTS = {
    "Spider": "demo-spider-codex-results.jsonl",
    "WikiTQ": "demo-wikitq-codex-results.jsonl",
    "GSM8k": "demo-gsm8k-codex-results.jsonl",
    "MBPP": "demo-mbpp-codex-results.jsonl",
}

N_MODELS = 21
N_ORGS = 10

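# Editor's sketch, not part of the original commit: the JSONL files above
# presumably hold one JSON object per line; a minimal loader (the name
# `load_results` is hypothetical) could look like this:
def load_results(dataset_name: str) -> list:
    with open(RESULT_FILES_DICTS[dataset_name]) as f:
        return [json.loads(line) for line in f]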
#################### Setups must go first ####################
st.set_page_config(layout="wide")

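# (Editor's note: Streamlit requires st.set_page_config to be the first
# Streamlit call in the script, hence the "must go first" comment above.)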
#################### Side Bar ####################
with st.sidebar:
    st.markdown("# About")
    st.markdown("**L2CEval** is a framework for evaluating language-to-code generation with LLMs.")
    with st.expander(":blue[**Authors**]", expanded=False):
        st.markdown("**Ansong Ni$^♠$, Pengcheng Yin$^♣$, Yilun Zhao$^♠$, Martin Riddell$^♠$, Troy Feng$^♠$, Rui Shen$^♠$, Stephen Yin$^♠$, Ye Liu$^♥$, Semih Yavuz$^♥$, " \
                    "Caiming Xiong$^♥$, Shafiq Joty$^♥$, Yingbo Zhou$^♥$, Dragomir Radev$^♠$, Arman Cohan$^♠♦$**")
        st.markdown("**♠: Yale University, ♣: Google DeepMind, ♥: Salesforce Research, ♦: Allen Institute for AI**")
    # st.markdown("**Authors**: Ansong Ni, Srini Iyer, Dragomir Radev, Ves Stoyanov, Wen-tau Yih, Sida I. Wang*, Xi Victoria Lin*")
    # st.markdown("**Demo made by**: [Ansong Ni](https://niansong1996.github.io/)")
    # st.markdown("**All experiment code on [GitHub](https://github.com/niansong1996/lever)**")

#################### START OF DEMO ####################

# some basic intro
# st.image("")
st.markdown("![Sample Image](./images/l2ceval-logo.png)")

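# NOTE (editor): st.markdown image links with a relative local path like the
# one above typically do not render in Streamlit, since the file is not served
# over HTTP; st.image("images/l2ceval-logo.png") is the usual alternative.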
st.markdown("### L2CEval: Evaluating Language-to-Code Generation Capabilities of Large Language Models")
st.markdown("**Ansong Ni$^♠$, Pengcheng Yin$^♣$, Yilun Zhao$^♠$, Martin Riddell$^♠$, Troy Feng$^♠$, Rui Shen$^♠$, Stephen Yin$^♠$**")
st.markdown("**Ye Liu$^♥$, Semih Yavuz$^♥$, " \
            "Caiming Xiong$^♥$, Shafiq Joty$^♥$, Yingbo Zhou$^♥$, Dragomir Radev$^♠$, Arman Cohan$^♠♦$**")
st.markdown("♠: Yale University, ♣: Google DeepMind, ♥: Salesforce Research, ♦: Allen Institute for AI")

st.info("###### Currently we are only displaying the main results from the paper; more interactive demos will be added later. Stay tuned!")

st.divider()

# st.markdown("#### Abstract")
# st.markdown("""
# Recently, large language models (LLMs), especially those that are pretrained
# on code, have demonstrated strong capabilities in generating programs from
# natural language inputs in a few-shot or even zero-shot manner. Despite
# promising results, there is a notable lack of a comprehensive evaluation of
# these models' language-to-code generation capabilities. Existing studies often
# focus on specific tasks, model architectures, or learning paradigms, leading to
# a fragmented understanding of the overall landscape. In this work, we present
# L2CEval, a systematic evaluation of the language-to-code generation
# capabilities of LLMs on 7 tasks across the domain spectrum of semantic parsing,
# math reasoning and Python programming, analyzing the factors that potentially
# affect their performance, such as model size, pretraining data, instruction
# tuning, and different prompting methods. In addition to assessing model
# performance, we measure confidence calibration for the models and conduct human
# evaluations of the output programs. This enables us to identify and analyze the
# typical failure modes across various tasks and models. L2CEval offers a
# comprehensive understanding of the capabilities and limitations of LLMs in
# language-to-code generation. We also release the evaluation framework and all
# model outputs, hoping to lay the groundwork for future research in this
# domain.
# """)

st.markdown("#### Language-to-Code (L2C) Generation")
st.markdown("Language-to-Code (L2C) generation is a class of tasks that map from natural language to code. It is " \
            "the cornerstone of many applications in AI, such as 1) chatbots; 2) coding assistants; " \
            "3) natural language interfaces for databases; and 4) robotic control.")
st.image("images/pipeline.png", caption="Example of L2C tasks", use_column_width="auto")

st.divider()


st.markdown("#### L2CEval - Tasks")
st.markdown("We evaluate the L2C capabilities of LLMs on 7 tasks across the domain spectrum of *semantic parsing*, \
            *math reasoning* and *Python programming*:")
st.markdown("""
| Domain             | Dataset                                                        | Split | Size  | Input                | Output          |
|--------------------|----------------------------------------------------------------|-------|-------|----------------------|-----------------|
| Semantic Parsing   | [Spider (Yu et al., 2018)](https://yale-lily.github.io/spider) | Dev   | 1,000 | DB schema + NL       | SQL query       |
|                    | [WikiTQ (Pasupat and Liang, 2015)](https://ppasupat.github.io/WikiTableQuestions) | Dev | 2,828 | Table headers + NL | SQL query |
| Math Reasoning     | [GSM8k (Cobbe et al., 2021)](https://github.com/openai/grade-school-math) | All | 1,494 | Math problem in NL | Python solution |
|                    | [SVAMP (Patel et al., 2021)](https://github.com/arkilpatel/SVAMP) | All | 996 | Math problem in NL | Python solution |
| Python Programming | [MBPP (Austin et al., 2021)](https://github.com/google-research/google-research/blob/master/mbpp/README.md) | Test | 500 | NL spec. + 1 test | Python function |
|                    | [HumanEval (Chen et al., 2021)](https://github.com/openai/human-eval) | All | 164 | NL spec. + 1-3 tests | Python function |
|                    | [DS-1000 (Lai et al., 2022)](https://github.com/HKUNLP/DS-1000) | All | 1,000 | NL spec. | Python lines |
""")

st.divider()

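# Editor's illustration, not part of the original commit: for the math
# reasoning rows above, the model maps an NL problem to a Python solution
# whose return value is the answer. A hypothetical GSM8k-style instance:
#   Q: "A pen costs $2 and a notebook costs $3.
#       How much do 4 pens and 2 notebooks cost?"
def example_solution():
    pens = 4 * 2       # 4 pens at $2 each
    notebooks = 2 * 3  # 2 notebooks at $3 each
    return pens + notebooks  # evaluates to 14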
st.markdown("#### L2CEval - Models")
st.markdown(f"We evaluate {N_MODELS} models from {N_ORGS} organizations. Here is a summary of the *open-source* models we evaluated:")
st.markdown("""
| Organization | Model Name           | Release Time | Sizes        | # All Tokens | # Code Tokens | Ctx. Len. | Code Specific | Inst. Tuned |
|--------------|----------------------|--------------|--------------|--------------|---------------|-----------|---------------|-------------|
| Salesforce   | CodeGen-multi        | 2022-3       | 6.1/16.1B    | 505B         | 119B          | 2,048     | ✓             | ✗           |
| Salesforce   | CodeGen-mono         | 2022-3       | 6.1/16.1B    | 577B         | 191B          | 2,048     | ✓             | ✗           |
| Salesforce   | CodeGen-2.5-multi    | 2023-7       | 7B           | 1.4T         | 1.4T          | 2,048     | ✓             | ✗           |
| Salesforce   | CodeGen-2.5-mono     | 2023-7       | 7B           | -            | -             | 2,048     | ✓             | ✗           |
| Salesforce   | CodeGen-2.5-instruct | 2023-7       | 7B           | -            | -             | 2,048     | ✓             | ✓           |
| Eleuther AI  | GPT-J                | 2021-5       | 6.1B         | 402B         | 46B           | 2,048     | ✗             | ✗           |
| Eleuther AI  | GPT-NeoX             | 2022-4       | 20.6B        | 472B         | 54B           | 2,048     | ✗             | ✗           |
| Eleuther AI  | Pythia               | 2023-4       | 1.4/6.9/12B  | 300B         | 35B           | 2,048     | ✗             | ✗           |
| Databricks   | Dolly-v2             | 2023-4       | 6.9/12B      | -            | -             | 2,048     | ✗             | ✓           |
| BigCode      | SantaCoder           | 2023-1       | 1.1B         | 236B         | 236B          | 2,048     | ✓             | ✗           |
| BigCode      | StarCoder            | 2023-5       | 15.5B        | 1T           | 1T            | 8,192     | ✓             | ✗           |
| BigCode      | StarCoderPlus        | 2023-6       | 15.5B        | 1.6T         | 1T            | 8,192     | ✓             | ✗           |
| Meta AI      | InCoder              | 2022-4       | 1.3/6.7B     | 52B          | 52B           | 2,048     | ✓             | ✗           |
| Meta AI      | LLaMA                | 2023-2       | 6.7/13B      | 1T           | 45B           | 2,048     | ✗             | ✗           |
| Meta AI      | LLaMA-30B            | 2023-2       | 32.5B        | 1.4T         | 63B           | 2,048     | ✗             | ✗           |
| Meta AI      | LLaMA-2              | 2023-7       | 7/13/70B     | 2T           | -             | 4,096     | ✗             | ✗           |
| Meta AI      | CodeLLaMA            | 2023-7       | 7/13/34B     | 2.5T         | 435B          | 16,384    | ✓             | ✗           |
| Stanford     | Alpaca               | 2023-3       | 6.7/13/32.5B | -            | -             | 2,048     | ✗             | ✓           |
| LMSYS        | Vicuna               | 2023-3       | 6.7/13/32.5B | -            | -             | 2,048     | ✗             | ✓           |
| Replit       | Replit-code-v1-3b    | 2023-5       | 2.7B         | 525B         | 525B          | 2,048     | ✓             | ✗           |
| MosaicML     | MPT-7B               | 2023-5       | 7B           | 1T           | 135B          | 2,048     | ✗             | ✗           |
| MosaicML     | MPT-7B-instruct      | 2023-5       | 7B           | -            | -             | 2,048     | ✗             | ✓           |
| MosaicML     | MPT-30B              | 2023-6       | 30B          | 1T           | 135B          | 8,192     | ✗             | ✗           |
| MosaicML     | MPT-30B-instruct     | 2023-6       | 30B          | -            | -             | 8,192     | ✗             | ✓           |
""")

st.markdown("\n\n\n\n")

st.markdown("In addition, we also evaluated the following *proprietary* models:")
st.markdown("""
- OpenAI GPT-4
- OpenAI GPT-3.5-turbo
- OpenAI text-davinci-002
- OpenAI text-davinci-003
- OpenAI code-davinci-002
- OpenAI code-cushman-001
""")
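# Editor's sketch, not part of the original commit: given the constants and the
# so-far-unused pandas import above, a future interactive results view might
# look like the following; `load_results` is the hypothetical loader sketched
# near the top of the file.
# dataset = st.selectbox("Dataset", DATASET_NAMES)
# results = pd.DataFrame(load_results(dataset))
# st.dataframe(results)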
images/l2ceval-logo.png
ADDED
Git LFS Details
images/pipeline.png
ADDED
Git LFS Details