"""Gradio leaderboard app for KO-Bench.

Reads per-turn KO-Bench judge scores and an MT-Bench leaderboard dump from
CSV files, builds three ranked tables (combined, OpenAI-judged, keval-judged)
and a per-model detail view with radar charts, then launches the Gradio UI.
"""
import random

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

file_result_score = 'ko_bench.csv'
file_full_lb = 'mt_bench_240805.csv'

# Score categories, in the order they are drawn around the radar chart.
CATEGORIES = ["Writing", "Roleplay", "Reasoning", "Math", "Coding", "Extraction", "STEM", "Humanities"]
# Trace names for the four datasets passed to radar_chart, in argument order.
category_labels = ['Selected model turn1', 'Selected model turn2', 'Top1 turn1', 'Top1 turn2']

# Substring -> value overrides applied before consulting the MT-Bench
# leaderboard (matched case-insensitively against the model name).
_ORGANIZATION_OVERRIDES = {'mistral-large': 'Mistral', 'koni-llama3-8b': 'KISTI'}
_LICENSE_OVERRIDES = {'mistral-large': 'Apache-2.0', 'koni-llama3-8b': 'llama3'}

# read csv
df_result_score = pd.read_csv(file_result_score)
df_full_lb = pd.read_csv(file_full_lb)

# dataframe
# NOTE(review): pd.DataFrame(frame) does not make a deep copy, so depending on
# the pandas version `df`, `df_rs` and `df_result_score` may share data and
# later in-place edits of `df_rs` may be visible through `df` -- confirm
# whether explicit `.copy()` calls are wanted here.
df = pd.DataFrame(df_result_score)
df_rs = pd.DataFrame(df_result_score)
df_full_lboard = pd.DataFrame(df_full_lb)

# Rename MT-bench's GPT-4-1106-preview entry to gpt-4-0125-preview.
df_full_lboard.replace('GPT-4-1106-preview', 'gpt-4-0125-preview', inplace=True)
models = df_full_lboard['Model'].unique()  # list of leaderboard models
df_rs.replace("", np.nan, inplace=True)  # empty strings must not count as scores


def custom_mean(series):
    """Return the mean of *series* after coercing its values to numbers.

    Values that cannot be parsed as numbers become NaN and are ignored; the
    result is NaN when no numeric value remains.
    """
    # Series.mean() skips NaN and already yields NaN for an all-NaN series.
    return pd.to_numeric(series, errors='coerce').mean()


def _leaderboard_value(model, column):
    """Return *column* of the first MT-Bench row whose model name matches.

    Matching ignores case. Returns '' when no row matches.
    """
    matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model.lower()]
    if matching_rows.empty:
        return ''
    return matching_rows[column].values[0]


def _override_value(model, overrides):
    """Return the override whose key is a substring of *model*, else None."""
    lowered = model.lower()
    for fragment, value in overrides.items():
        if fragment in lowered:
            return value
    return None


def get_mt_bench(model):
    """Return the MT-bench score listed for *model* ('' if not listed)."""
    return _leaderboard_value(model, 'MT-bench (score)')


def get_organization(model):
    """Return the organization for *model* ('' if unknown).

    Hard-coded overrides take precedence over the MT-Bench leaderboard.
    """
    override = _override_value(model, _ORGANIZATION_OVERRIDES)
    if override is not None:
        return override
    return _leaderboard_value(model, 'Organization')


def get_license(model):
    """Return the license for *model* ('' if unknown).

    Hard-coded overrides take precedence over the MT-Bench leaderboard.
    """
    override = _override_value(model, _LICENSE_OVERRIDES)
    if override is not None:
        return override
    return _leaderboard_value(model, 'License')


def _mean_by(frame, keys):
    """Group *frame* by *keys* and average every other column with custom_mean."""
    value_columns = [col for col in frame.columns if col not in keys]
    return frame.groupby(keys).agg({col: custom_mean for col in value_columns}).reset_index()


def _judge_leaderboard(scores, use_keval):
    """Build the ranked per-model table for one judge family.

    *use_keval* selects rows judged by 'keval'; otherwise every other judge
    is used. Turn rows are averaged per model and sorted by 'score'.
    """
    is_keval = scores['judge_model'] == 'keval'
    subset = scores[is_keval if use_keval else ~is_keval]
    subset = subset.drop(columns=['judge_model', 'turn'])  # merge turn 1/2 scores per model
    ranked = _mean_by(subset, ['model']).round(2)
    ranked = ranked.sort_values(by='score', ascending=False)
    ranked.insert(0, 'rank', range(1, len(ranked) + 1))
    return ranked


# dataframe_full
df_full_rs = df_rs.copy()
df_full_rs.rename(columns={'score': 'KO-Bench'}, inplace=True)
df_full_rs = df_full_rs.drop(columns=CATEGORIES)
df_full_rs = df_full_rs.drop(columns=['turn'])  # merge turn 1/2 scores per model
df_full_rs = _mean_by(df_full_rs, ['model', 'judge_model']).round(2)
df_full_rs.replace("", np.nan, inplace=True)

# Split the per-judge score into KO-Bench/openai and KO-Bench/keval columns;
# the column that does not apply to a row is left as NaN.
_is_keval_row = df_full_rs['judge_model'] == 'keval'
df_full_rs['KO-Bench/openai'] = df_full_rs['KO-Bench'].where(~_is_keval_row)
df_full_rs['KO-Bench/keval'] = df_full_rs['KO-Bench'].where(_is_keval_row)
df_full_rs = df_full_rs.drop(columns=['judge_model'])

# Merge the openai / keval rows of each model into a single row.
df_full_rs = _mean_by(df_full_rs, ['model']).round(2)
df_full_rs.replace("", np.nan, inplace=True)

df_full_rs['MT-Bench'] = df_full_rs['model'].apply(get_mt_bench)
df_full_rs['MT-Bench'] = df_full_rs['MT-Bench'].str.replace('-', '', regex=False)
df_full_rs['Organization'] = df_full_rs['model'].apply(get_organization)
df_full_rs['License'] = df_full_rs['model'].apply(get_license)

df_full_rs = df_full_rs.sort_values(by='KO-Bench', ascending=False)
df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
df_full_rs = df_full_rs.drop(columns=['KO-Bench'])  # only used for ranking

plot_models = df_full_rs['model'].unique()  # models offered in the detail view

# dataframe
df_rs['MT-Bench'] = df_rs['model'].apply(get_mt_bench)
df_rs['MT-Bench'] = df_rs['MT-Bench'].str.replace('-', '', regex=False)
df_rs.replace("", np.nan, inplace=True)

# dataframe_openai
df_openai = _judge_leaderboard(df_rs, use_keval=False)

# dataframe_keval
df_keval = _judge_leaderboard(df_rs, use_keval=True)

# model detail view
plot_models_list = plot_models.tolist()
random.seed(42)


def search_dataframe(query):
    """Return the rows of `df` in which any cell, as a string, equals *query*.

    An empty or missing query returns the whole DataFrame.
    """
    if not query:
        return df
    row_matches = df.apply(lambda row: (row.astype(str) == query).any(), axis=1)
    return df[row_matches]


def radar_chart(categories, Selected_model_turn1, Selected_model_turn2, Top1_turn1, Top1_turn2):
    """Draw a radar chart comparing a selected model with the top model.

    Args:
        categories: list of axis labels.
        Selected_model_turn1, Selected_model_turn2, Top1_turn1, Top1_turn2:
            lists of score rows (list of lists); each is flattened and must
            end up with one value per category.

    Returns:
        A plotly Figure, or an error string when a dataset's length does not
        match the number of categories.
    """
    values_lists = [
        [float(value) for row in rows for value in row]
        for rows in (Selected_model_turn1, Selected_model_turn2, Top1_turn1, Top1_turn2)
    ]

    fig = go.Figure()

    for i, values in enumerate(values_lists):
        if len(categories) != len(values):
            # NOTE(review): callers feed this result to gr.Plot -- confirm a
            # plain string is an acceptable value there.
            return f"Error in dataset {i+1}: Number of categories and values must be the same."

        fig.add_trace(go.Scatterpolar(
            r=values + [values[0]],  # Closing the loop of the radar chart
            theta=categories + [categories[0]],  # Closing the loop of the radar chart
            mode='lines',
            name=category_labels[i]  # Label for the dataset
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, max(max(values) for values in values_lists)],
                showline=True,
            ),
            angularaxis=dict(
                rotation=0,
                direction='clockwise'
            )
        ),
        showlegend=True,
        width=555,
        height=550,
        # NOTE(review): left margin (1000) is larger than the figure width -- confirm intended.
        margin=dict(l=1000, r=20, t=20, b=20),
        autosize = False,
        paper_bgcolor='white',
        plot_bgcolor='lightgrey'
    )
    return fig


def _turn_scores(model, turn, use_keval):
    """Return the category score rows of *model* for one turn and judge family.

    Columns are selected by CATEGORIES so the values line up with the radar
    chart's axis labels regardless of the CSV column order.
    """
    is_keval = df['judge_model'] == 'keval'
    judge_mask = is_keval if use_keval else ~is_keval
    condition = judge_mask & (df['turn'] == turn) & (df['model'] == model)
    return df.loc[condition, CATEGORIES].values.tolist()


def _judge_plot(dropdown_model, ranked, use_keval):
    """Radar chart of *dropdown_model* against the top-ranked model of *ranked*."""
    # `ranked` is sorted by score but keeps its pre-sort index labels, so the
    # best model has to be taken by position, not by label 0.
    top_model = ranked['model'].iloc[0]
    return radar_chart(
        CATEGORIES,
        _turn_scores(dropdown_model, 1, use_keval),
        _turn_scores(dropdown_model, 2, use_keval),
        _turn_scores(top_model, 1, use_keval),
        _turn_scores(top_model, 2, use_keval),
    )


def search_openai_plot(dropdown_model):
    """Radar chart for *dropdown_model* using the non-keval judge scores."""
    return _judge_plot(dropdown_model, df_openai, use_keval=False)


def search_keval_plot(dropdown_model):
    """Radar chart for *dropdown_model* using the keval judge scores."""
    return _judge_plot(dropdown_model, df_keval, use_keval=True)


#gradio
with gr.Blocks() as demo:
    gr.Markdown("")
    gr.Markdown("# 🏆 KO-Bench Leaderboard")
    gr.Markdown("")
    gr.Markdown("#### The Ko-bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
    gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
    gr.Markdown("- KO-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
    gr.Markdown("- KO-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as an evaluation model.")
    gr.Markdown("")
    gr.Markdown("github : https://github.com/davidkim205/ko-bench")
    gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
    gr.Markdown("")

    with gr.TabItem("KO-Bench"):
        gr.Dataframe(value=df_full_rs)
    with gr.TabItem("Openai Judgment"):
        gr.Dataframe(value=df_openai)
    with gr.TabItem("Keval Judgment"):
        gr.Dataframe(value=df_keval)
    with gr.TabItem("Model Detail View"):
        with gr.Blocks():
            with gr.Row():
                dropdown = gr.Dropdown(choices=plot_models_list, label="Choose a Model")
            with gr.Row():
                dataframe = gr.Dataframe(label="Model Detail View")
                dropdown.change(fn=search_dataframe, inputs=dropdown, outputs=dataframe)
            # Both judge plots are placed in the same row.
            with gr.Row():
                plot_openai = gr.Plot(label="Openai Plot")
                dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
                plot_keval = gr.Plot(label="Keval Plot")
                dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)

demo.launch(share=True, server_name="0.0.0.0")