# MultiPL-E / app.py
# Last commit: "Two digits of precision" (1d6280a), by arjunguha. File size: 4 kB.
import gradio as gr
import pandas as pd
import numpy as np
# Lookup table from a language code (the token taken from each dataset name
# and used as a pivot column) to the full language name displayed in the UI.
extension_to_language = {
    "clj": "Clojure",
    "cpp": "C++",
    "cs": "C#",
    "d": "D",
    "elixir": "Elixir",
    "go": "Go",
    "hs": "Haskell",
    "java": "Java",
    "jl": "Julia",
    "js": "JavaScript",
    "lua": "Lua",
    "ml": "OCaml",
    "php": "PHP",
    "pl": "Perl",
    "r": "R",
    "rb": "Ruby",
    "rkt": "Racket",
    "rs": "Rust",
    "scala": "Scala",
    "sh": "Shell",
    "swift": "Swift",
    "ts": "TypeScript",
}
# Load the results table from 'passk.csv'. The path is relative, so it is
# resolved against the process's current working directory. Later statements
# in this file read its 'Dataset' and 'Estimate' columns.
df = pd.read_csv('passk.csv')
def extract_info(dataset: str) -> pd.Series:
    """Split a hyphen-delimited dataset name into its language and model.

    The name is split on ``-``. The second field is taken as the language
    code, the last two fields are discarded, and every field in between is
    re-joined with ``-`` to form the model identifier, so a model identifier
    may itself contain hyphens.

    Args:
        dataset: Hyphen-delimited dataset name, e.g.
            ``"humaneval-cpp-starcoder2_15b-0.2-reworded"`` (illustrative;
            the actual naming scheme is whatever the CSV contains).

    Returns:
        A Series with the keys ``'Language'`` and ``'Model'``. ``'Model'`` is
        the empty string when the name has fewer than five fields.

    Raises:
        IndexError: If ``dataset`` contains no hyphen, so there is no second
            field to use as the language.
    """
    parts = dataset.split('-')
    language = parts[1]
    # Fields 2 .. -3 belong to the model; the final two fields are dropped.
    model = '-'.join(parts[2:-2])
    return pd.Series({'Language': language, 'Model': model})
# Derive the 'Language' and 'Model' columns by applying extract_info to every
# 'Dataset' value; the Series returned for each row supplies that row's
# values for the two new columns.
df[['Language', 'Model']] = df['Dataset'].apply(extract_info)
# Display names for model identifiers. Identifiers that are not listed here
# are shown unchanged (see get_friendly_name).
model_to_friendly = {
    "starcoder2_15b": "StarCoder2-15B",
    "deepseekcoder_v2lite_base": "DeepSeekCoder2-Lite-Base",
}


def get_friendly_name(model: str) -> str:
    """Return the display name for a model identifier.

    Args:
        model: Model identifier as extracted from a dataset name.

    Returns:
        The entry from ``model_to_friendly`` when one exists; otherwise
        ``model`` itself, unchanged.
    """
    return model_to_friendly.get(model, model)
# Reshape the results into a model-by-language table: one row per 'Model',
# one column per 'Language' code, with the 'Estimate' value in each cell.
# Model/language combinations absent from the data become NaN.
# NOTE: DataFrame.pivot raises ValueError if the same (Model, Language) pair
# occurs more than once in df.
pivot = df.pivot(index='Model', columns='Language', values='Estimate')
# Sorted lists of the language codes (pivot columns) and model identifiers
# (pivot index); both are used below to build the UI and the table.
languages = sorted(pivot.columns)
models = sorted(pivot.index)
def update_table(selected_languages):
    """Build the table to display for the selected language codes.

    Args:
        selected_languages: Language codes (column labels of the module-level
            ``pivot`` table), in the order the columns should appear.

    Returns:
        A DataFrame whose first column is ``'Model'`` (friendly model names),
        followed by one column per selected language titled with the full
        language name (or the bare code when no full name is known).
        Numeric cells are rendered as strings with two decimals and missing
        results as ``"-"``. When nothing is selected, only the ``'Model'``
        column is returned.

    Raises:
        KeyError: If a selected code is not a column of ``pivot``.
    """
    if not selected_languages:
        return pd.DataFrame({'Model': [get_friendly_name(model) for model in models]})

    def format_cell(value):
        # Missing model/language combinations are shown as a dash.
        if pd.isna(value):
            return "-"
        # NumPy integer scalars are not instances of ``int``, so they are
        # listed explicitly to get the same two-decimal formatting.
        if isinstance(value, (int, float, np.integer, np.floating)):
            return f"{value:.2f}"
        return value

    # Column-wise Series.map is used instead of DataFrame.applymap, which is
    # deprecated since pandas 2.1; Series.map works on old and new versions.
    display_data = pivot[selected_languages].apply(lambda column: column.map(format_cell))
    # Add the Model column as the first column
    display_data.insert(0, 'Model', [get_friendly_name(model) for model in display_data.index])
    # Reset the index to remove the model names from the index
    display_data = display_data.reset_index(drop=True)
    # Rename columns to full language names
    display_data.columns = ['Model'] + [extension_to_language.get(lang, lang) for lang in selected_languages]
    return display_data
def get_initial_table():
    """Return the table shown on first load, with every language selected.

    Returns:
        The result of ``update_table`` called with the full, sorted list of
        language codes in ``languages``.
    """
    return update_table(languages)
# Create the Gradio interface: an introduction, a checkbox group for choosing
# languages, and a results table that is rebuilt whenever the selection changes.
with gr.Blocks() as app:
    gr.Markdown("""
# MultiPL-E Results
[MultiPL-E](https://huggingface.co/datasets/nuprl/MultiPL-E) is a dataset for
evaluating large language models for code generation that supports several
programming languages. It takes the OpenAI HumanEval and the Mostly Basic
Python Programs (MBPP) benchmarks and uses little compilers to translate them
to other languages. It is easy to add support for new languages and benchmarks.
This table shows how some recent Code LLMs perform on MultiPL-HumanEval.
We use the MultiPL-E 3.0 problems, which incorporates several fixes and
supports several new programming languages.
""")

    # Checkbox labels have the form "<Full name> (<code>)". A code that is
    # missing from extension_to_language falls back to the bare code instead
    # of raising KeyError, matching how the table headers are built.
    language_labels = [
        f"{extension_to_language.get(lang, lang)} ({lang})" for lang in languages
    ]

    with gr.Row():
        language_checkboxes = gr.CheckboxGroup(
            choices=language_labels,
            label="Select Languages",
            # Separate list object so the initial value and the choices
            # never alias each other.
            value=list(language_labels),
        )

    # NOTE(review): the original indentation was lost; the table is assumed
    # to sit below the checkbox row rather than inside it. Confirm the layout.
    table = gr.Dataframe(
        value=get_initial_table,
        headers=['Model'] + [extension_to_language.get(lang, lang) for lang in languages],
        type="pandas",
    )

    def update_table_wrapper(selected_languages):
        """Translate selected checkbox labels to codes and rebuild the table.

        Args:
            selected_languages: Checkbox labels of the form
                ``"<Full name> (<code>)"``.

        Returns:
            The DataFrame produced by ``update_table`` for the extracted codes.
        """
        # The code is the text after the last "(" with the closing ")" removed.
        selected_codes = [lang.split('(')[-1].strip(')') for lang in selected_languages]
        return update_table(selected_codes)

    language_checkboxes.change(update_table_wrapper, inputs=[language_checkboxes], outputs=[table])
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    app.launch()