# Spaces: Sleeping -- Hugging Face page-status text captured together with the source; not part of the program.
import gradio as gr | |
import pandas as pd | |
from src.css_html import custom_css | |
from src.utils import ( | |
AutoEvalColumn, | |
fields, | |
make_clickable_names, | |
make_plot_data | |
) | |
from src.demo import ( | |
generate, | |
random_examples, | |
) | |
# System prompt pre-filled in the demo's "System prompt" textbox.
DEFAULT_SYSTEM_PROMPT = (
    "You are a helpful, respectful and honest assistant. "
    "Always answer as helpfully as possible, while being safe. "
    "Your answers should not include any harmful, unethical, racist, sexist, "
    "toxic, dangerous, or illegal content. "
    "Please ensure that your responses are socially unbiased and positive in nature."
    "\n\n"
    "If a question does not make any sense, or is not factually coherent, "
    "explain why instead of answering something not correct. "
    "If you don't know the answer to a question, please don't share false information."
)

# Upper bound and initial value of the "Max new tokens" slider.
MAX_MAX_NEW_TOKENS = 1024
DEFAULT_MAX_NEW_TOKENS = 512

# Leaderboard results, read once at import time.
df = pd.read_csv("data/eval_board.csv")

# Column names / datatypes declared on AutoEvalColumn, in declaration order.
COLS = [column.name for column in fields(AutoEvalColumn)]
TYPES = [column.type for column in fields(AutoEvalColumn)]

# Same lists restricted to columns flagged as displayed by default and not hidden.
COLS_LITE = [
    column.name
    for column in fields(AutoEvalColumn)
    if column.displayed_by_default and not column.hidden
]
TYPES_LITE = [
    column.type
    for column in fields(AutoEvalColumn)
    if column.displayed_by_default and not column.hidden
]
def add_new_eval(
    model: str,
    re2text_easy_precision: str,
    re2text_hard_precision: str,
    text2re_easy_precision: str,
    text2re_hard_precision: str,
    links: str,
) -> dict:
    """Assemble a leaderboard submission record from the submitted fields.

    Args:
        model: Name of the submitted model.
        re2text_easy_precision: Score reported for the Re2Text easy setting.
        re2text_hard_precision: Score reported for the Re2Text hard setting.
        text2re_easy_precision: Score reported for the Text2Re easy setting.
        text2re_hard_precision: Score reported for the Text2Re hard setting.
        links: Link accompanying the submission.

    Returns:
        A dict with the keys ``model``, ``re2text_easy``, ``re2text_hard``,
        ``text2re_easy``, ``text2re_hard`` and ``link`` holding the arguments
        unchanged. The record is not persisted anywhere by this function.
    """
    print("adding new eval")
    eval_entry = {
        "model": model,
        "re2text_easy": re2text_easy_precision,
        "re2text_hard": re2text_hard_precision,
        "text2re_easy": text2re_easy_precision,
        "text2re_hard": text2re_hard_precision,
        "link": links,
    }
    # Hand the record back to the caller; previously it was built and dropped.
    return eval_entry
def select_columns(df, columns):
    """Return ``df`` restricted to the model column plus the selected columns.

    The model-name column always comes first. The remaining columns are those
    that are present in ``df`` and listed in ``columns``, emitted in the order
    in which they appear in ``COLS`` (not in the order of ``columns``).
    """
    pinned = [AutoEvalColumn.model.name]

    # Walk COLS rather than `columns` so the table keeps its canonical ordering.
    selected = []
    for name in COLS:
        if name in df.columns and name in columns:
            selected.append(name)

    return df[pinned + selected]
# Copy the "Models" column into "pure_name" first, then pass the frame through
# make_clickable_names (presumably this rewrites the model names as links --
# confirm in src.utils).
df = make_clickable_names(df.assign(pure_name=df["Models"]))
# Build and launch the Gradio UI: a header, a tab group (leaderboard table,
# plots, submission placeholder) and an interactive demo section.
# NOTE(review): the nesting of the `with` blocks below was reconstructed from a
# copy of this file whose indentation had been flattened -- confirm the layout
# against the running app.
demo = gr.Blocks(css=custom_css)
with demo:
    with gr.Row():
        # NOTE(review): the paper title below links to the open_llm_leaderboard
        # Space rather than to the paper itself -- confirm the intended URL.
        gr.Markdown(
            """<div style="text-align: center;"><h1> 🤖ConvRe🤯 <span style='color: #e6b800;'>Leaderboard</span></h1></div>\
<br>\
<p> 🤖ConvRe🤯 is the benchmark proposed in our EMNLP 2023 paper: <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard"> An Investigation of LLMs’ Inefficacy in Understanding Converse Relations</a>. It aims to evaluate LLMs' ability on understanding converse relations. Converse relation is defined as the opposite of semantic relation while keeping the surface form of the triple unchanged. For example, the triple (x, has part, y) is interpreted as "x has a part called y" in normal relation, while "y has a part called x" in converse relation 🔁.
The experiments in our paper suggested that LLMs often resort to shortcut learning (or superficial correlations) and still face challenges on our 🤖ConvRe🤯 benchmark even for powerful models like GPT-4.
</p>""",
            elem_classes="markdown-text",
        )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🔢 Data", id=0):
            with gr.Accordion("➡️ See All Columns", open=False):
                # The model column is always shown, so it is not offered as a choice.
                shown_columns = gr.CheckboxGroup(
                    choices=[
                        c for c in COLS if c != AutoEvalColumn.model.name
                    ],
                    value=[
                        c for c in COLS_LITE if c != AutoEvalColumn.model.name
                    ],
                    label="",
                    elem_id="column-select",
                    interactive=True,
                )
            # Visible table: model column plus the initially selected columns.
            # NOTE(review): `datatype=TYPES` lists a type for every column in
            # COLS while only a subset is displayed here; TYPES_LITE may be
            # what was intended -- confirm.
            leaderboard_df_re2text = gr.components.Dataframe(
                value=df[
                    [
                        AutoEvalColumn.model.name,
                    ] + shown_columns.value
                ],
                headers=[
                    AutoEvalColumn.model.name,
                ] + shown_columns.value,
                datatype=TYPES,
                elem_id="leaderboard-table",
                interactive=False,
            )
            # Invisible copy of the full table; it is the data source that
            # select_columns filters whenever the checkbox selection changes.
            hidden_leaderboard_df_re2text = gr.components.Dataframe(
                value=df,
                headers=COLS,
                datatype=["str" for _ in range(len(COLS))],
                visible=False,
            )
            shown_columns.change(
                select_columns,
                [hidden_leaderboard_df_re2text, shown_columns],
                leaderboard_df_re2text,
            )

        with gr.TabItem("📊 Plot", id=1):
            with gr.Row():
                with gr.Column():
                    gr.LinePlot(
                        make_plot_data(df, task="Re2Text"),
                        x="Setting",
                        y="Accuracy",
                        color="Symbol",
                        title="Re2Text",
                        y_lim=[0, 100],
                        x_label_angle=0,
                        height=400,
                        width=500,
                    )
                with gr.Column():
                    gr.LinePlot(
                        make_plot_data(df, task="Text2Re"),
                        x="Setting",
                        y="Accuracy",
                        color="Symbol",
                        title="Text2Re",
                        y_lim=[0, 100],
                        x_label_angle=0,
                        height=400,
                        width=500,
                    )

        with gr.TabItem("Submit results 🚀", id=3):
            gr.Markdown("Submit Here")

    with gr.Column():
        gr.Markdown(
            """<div style="text-align: center;"><h2> 🤖ConvRe🤯 Demo </h2></div>\
<br>\
""",
            elem_classes="markdown-text",
        )
        output_box = gr.Textbox(lines=10, max_lines=10, label="ChatBot")
        input_box = gr.Textbox(lines=12, max_lines=12, label="Input")

        with gr.Row():
            # Each button passes a fixed key to random_examples through a hidden
            # text component and writes the result into the input box.
            re2text_easy_btn = gr.Button("Random Re2Text Easy Example 😄")
            re2text_easy_btn.click(
                fn=random_examples,
                inputs=gr.Text("re2text-easy", visible=False),
                outputs=input_box,
            )
            re2text_hard_btn = gr.Button("Random Re2Text Hard Example 🤯")
            re2text_hard_btn.click(
                fn=random_examples,
                inputs=gr.Text("re2text-hard", visible=False),
                outputs=input_box,
            )
            text2re_easy_btn = gr.Button("Random Text2Re Easy Example 😄")
            text2re_easy_btn.click(
                fn=random_examples,
                inputs=gr.Text("text2re-easy", visible=False),
                outputs=input_box,
            )
            text2re_hard_btn = gr.Button("Random Text2Re Hard Example 🤯")
            text2re_hard_btn.click(
                fn=random_examples,
                inputs=gr.Text("text2re-hard", visible=False),
                outputs=input_box,
            )

        with gr.Accordion("Additional Inputs", open=False):
            sys_prompt = gr.Textbox(label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6)
            # NOTE(review): the two sliders below are not bound to names and are
            # not passed to `generate`, so changing them currently has no effect.
            gr.Slider(
                label="Max new tokens",
                minimum=1,
                maximum=MAX_MAX_NEW_TOKENS,
                step=1,
                value=DEFAULT_MAX_NEW_TOKENS,
            )
            gr.Slider(
                label="Temperature",
                minimum=0,
                maximum=4.0,
                step=0.05,
                value=0,
            )

        with gr.Row():
            gr.ClearButton([input_box, output_box])
            submit_btn = gr.Button("Submit")

        # Send the prompt and system prompt to the model; show the reply in the output box.
        submit_btn.click(generate, inputs=[input_box, sys_prompt], outputs=[output_box])

demo.launch()