from typing import List
import gradio as gr
import numpy as np
import pandas as pd
from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT
ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", sep='\t')      # tab-separated values
ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", sep='\t')  # tab-separated values
ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", sep=',')        # comma-separated values
ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", sep=',')        # comma-separated values
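
# Schema sanity check (a minimal sketch, not part of the original app): every
# loader below filters on a "Size" column, and the subclass tables additionally
# select a "Model" column, so fail fast if the shipped CSVs drift from that.
for _name, _frame in [
    ("chinese_benchmark_gen", ORIGINAL_DF),
    ("chinese_benchmark_per", ORIGINAL_DF_PER),
    ("subclass_gen", ORIGINAL_DF_SUB_GEN),
    ("subclass_per", ORIGINAL_DF_SUB_PER),
]:
    assert "Size" in _frame.columns, f"{_name}.csv lacks a 'Size' column"
for _name, _frame in [("subclass_gen", ORIGINAL_DF_SUB_GEN), ("subclass_per", ORIGINAL_DF_SUB_PER)]:
    assert "Model" in _frame.columns, f"{_name}.csv lacks a 'Model' column"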
METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
SPLITS = ["Overall"] + SUBCLASS  # dropdown options: the overall view plus each subclass
CLASSIFICATION = {
"model_size": [
">65B",
"~30B",
"10B~20B",
"5B~10B",
"API",
]
}
_BIBTEX = """
@misc{zhang2024chinesesafechinesebenchmarkevaluating,
title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
author={Hengxiang Zhang and Hongfu Gao and Qiang Hu and Guanhua Chen and Lili Yang and Bingyi Jing and Hongxin Wei and Bing Wang and Haifeng Bai and Lei Yang},
year={2024},
eprint={2410.18491},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2410.18491},
}
"""
_LAST_UPDATED = "November 24, 2024"
banner_url = "./assets/logo.png"
_BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>' # noqa
def retrieve_array_from_text(text):
    """Parse a bracketed list such as "[0.1, 0.2]" into a float array."""
    # np.fromstring is deprecated for text input, so split and convert instead
    values = text.replace("[", "").replace("]", "").split(",")
    return np.array([float(v) for v in values if v.strip()], dtype=float)

def format_csv_numbers(text):
    """Keep only the part before the first '/' in a 'value/total' cell."""
    return text.split('/')[0]

def format_csv_numbers_second(text):
    """Split a whitespace-separated cell into its tokens."""
    return text.split()

def format_number(x):
    """Round to three significant digits, e.g. 0.87654 -> 0.877."""
    return float(f"{x:.3}")
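
# Illustrative behaviour of the helpers above (example inputs, not values taken
# from the shipped CSVs):
#   retrieve_array_from_text("[0.1, 0.2]")  -> array([0.1, 0.2])
#   format_csv_numbers("85.2/1024")         -> "85.2"
#   format_csv_numbers_second("85.2 1024")  -> ["85.2", "1024"]
#   format_number(0.87654)                  -> 0.877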
def get_dataset_csv(
    model_size: List[str],
):
    """Overall leaderboard for the generation split, filtered to the selected model sizes."""
    df = ORIGINAL_DF[ORIGINAL_DF['Size'].isin(model_size)]
    df = df.drop(columns="Size")
leaderboard_table = gr.components.Dataframe(
value=df,
interactive=False,
visible=True,
)
return leaderboard_table
def get_dataset_csv_per(
    model_size: List[str],
):
    """Overall leaderboard for the perplexity split, filtered to the selected model sizes."""
    df = ORIGINAL_DF_PER[ORIGINAL_DF_PER['Size'].isin(model_size)]
    df = df.drop(columns="Size")
leaderboard_table = gr.components.Dataframe(
value=df,
interactive=False,
visible=True,
)
return leaderboard_table
# Per-subclass table for the generation split.
def get_dataset_csv_sub_gen(
    model_size: List[str],
    subclass_choice: str,
):
    df = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN['Size'].isin(model_size)]
    df = df.drop(columns="Size")
    # keep the model name plus the three metric columns for the chosen subclass
    subclass_choice_label = ["Model", f"{subclass_choice}_Accuracy", f"{subclass_choice}_Precision", f"{subclass_choice}_Recall"]
    df = df[subclass_choice_label]
leaderboard_table = gr.components.Dataframe(
value=df,
interactive=False,
visible=True,
)
return leaderboard_table
# Per-subclass table for the perplexity split.
def get_dataset_csv_sub_per(
    model_size: List[str],
    subclass_choice: str,
):
    df = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER['Size'].isin(model_size)]
    df = df.drop(columns="Size")
    # keep the model name plus the three metric columns for the chosen subclass
    subclass_choice_label = ["Model", f"{subclass_choice}_Accuracy", f"{subclass_choice}_Precision", f"{subclass_choice}_Recall"]
    df = df[subclass_choice_label]
leaderboard_table = gr.components.Dataframe(
value=df,
interactive=False,
visible=True,
)
return leaderboard_table
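
# Both subclass CSVs are therefore assumed to carry one column triple per entry
# in SUBCLASS, e.g. for "Politics":
#   Politics_Accuracy, Politics_Precision, Politics_Recall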
def get_dataset_classifier_gen(
    model_size: List[str],
    main_choice: str,
):
    """Route to the overall table or a per-subclass table for the generation split."""
    if main_choice == "Overall":
        leaderboard_table = get_dataset_csv(model_size)
    else:
        # every other dropdown value is one of the SUBCLASS names
        leaderboard_table = get_dataset_csv_sub_gen(model_size, main_choice)
    return leaderboard_table
def get_dataset_classifier_per(
    model_size: List[str],
    main_choice: str,
):
    """Route to the overall table or a per-subclass table for the perplexity split."""
    if main_choice == "Overall":
        leaderboard_table = get_dataset_csv_per(model_size)
    else:
        # every other dropdown value is one of the SUBCLASS names
        leaderboard_table = get_dataset_csv_sub_per(model_size, main_choice)
    return leaderboard_table
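
# Example dispatch (illustrative, mirroring what the Gradio events below pass
# in): all model sizes with the "Politics" split selects the Politics_* columns
# of the generation table.
#   get_dataset_classifier_gen(CLASSIFICATION["model_size"], "Politics")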
with gr.Blocks() as demo:
gr.Markdown("<center><h1>ChineseSafe Leaderboard</h1></center>", elem_classes="markdown-text")
with gr.Row():
        # gr.Image(banner_url, height=160, scale=1)  # 👉 optional banner image
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
# gr.Textbox(_INTRODUCTION_TEXT, scale=5)
with gr.Row():
gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
with gr.Row():
gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")
with gr.Row():
with gr.Column(scale=0.8):
main_choice = gr.Dropdown(
choices=SPLITS,
value="Overall",
label="Type",
info="Please choose the type to display.",
)
with gr.Column(scale=10):
model_choice = gr.CheckboxGroup(
choices=CLASSIFICATION["model_size"],
                value=CLASSIFICATION["model_size"],  # all sizes selected by default
label="Model Size",
info="Please choose the model size to display.",
)
    # 👉 leaderboard tables, one tab per evaluation protocol
with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=6):
dataframe_all_gen = gr.components.Dataframe(
elem_id="leaderboard-table",
)
        with gr.TabItem("🏅 Perplexity", elem_id="od-benchmark-tab-table", id=5):
dataframe_all_per = gr.components.Dataframe(
elem_id="leaderboard-table",
)
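        # Both tables start empty; the demo.load hooks below populate them on page load.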
with gr.Row():
gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")
with gr.Row():
gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")
    # 👉 citation
with gr.Row():
        with gr.Accordion("📙 Citation", open=True):
gr.Textbox(
value=_BIBTEX,
lines=7,
label="Copy the BibTeX snippet to cite this source",
elem_id="citation-button",
show_copy_button=True
)
gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")
    # --------------------------- event wiring --------------------------------
    # Perplexity tab: refresh on any control change and populate on page load.
    main_choice.change(
        get_dataset_classifier_per,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_per,
    )
    model_choice.change(
        get_dataset_classifier_per,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_per,
    )
    demo.load(
        fn=get_dataset_classifier_per,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_per,
    )
    # Generation tab: refresh on any control change and populate on page load.
    main_choice.change(
        get_dataset_classifier_gen,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_gen,
    )
    model_choice.change(
        get_dataset_classifier_gen,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_gen,
    )
    demo.load(
        fn=get_dataset_classifier_gen,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_gen,
    )
demo.launch(share=True)