Spaces:

MERaLiON
/

SeaEval_Leaderboard

Running

App Files Files Community

SeaEval_Leaderboard / app /pages.py

zhuohan-7

Upload folder using huggingface_hub

a12f124 verified 4 months ago

raw

history blame

12.8 kB

	import streamlit as st
	from app.draw_diagram import *

	def dashboard():
	    """Render the SeaEval landing page.

	    Draws, top to bottom: the page title with GitHub badges, the
	    "What is SeaEval?" overview (two illustrations with explanatory
	    bullet points) and the paper citation.
	    """
	    seaeval_url = "https://seaeval.github.io/"

	    with st.container():
	        st.title("SeaEval")

	        st.markdown("""
	[gh]: https://github.com/SeaEval/SeaEval
	[![GitHub watchers](https://img.shields.io/github/watchers/SeaEval/SeaEval?style=social)][gh]
	[![GitHub Repo stars](https://img.shields.io/github/stars/SeaEval/SeaEval?style=social)][gh]
	""")

	    st.divider()
	    st.markdown(f"#### What is [SeaEval]({seaeval_url})?")

	    with st.container():
	        # Three columns; only the middle one is used, which centres the image.
	        _, cent_co, _ = st.columns(3)
	        with cent_co:
	            st.image("./style/seaeval_overall.png", width=500)
	        # Whitespace-only markdown used as a vertical spacer below the image.
	        st.markdown('''

	''')
	        st.markdown("##### A new benchmark for multilingual, multicultural foundation model evaluation, consisting of 28 datasets as the core and expanding over time.")
	        st.markdown(''':star: How models understand and reason with natural language?
	:balloon: Languages: English, Chinese, Malay, Spanish, Indonesian, Vietnamese, Filipino.
	''')

	        st.markdown(''':star: How models comprehend cultural practices, nuances and values?
	:balloon: 4 new datasets on Cultural Understanding.
	''')

	        st.markdown(''':star: How models perform across languages in terms of consistency?
	:balloon: 2 new datasets with curated metrics for Cross-Lingual Consistency.
	''')

	    with st.container():
	        _, cent_co, _ = st.columns(3)
	        with cent_co:
	            st.image("./style/consistency.png", width=500)
	        st.markdown("##### Evaluation with enhanced cross-lingual capabilities.")
	        st.markdown(''':star: How models perform according to different (paraphrased) instructions?
	:balloon: Each dataset is equipped with 5 different prompts to avoid randomness introduced by instructions,
	which is non-negligible.
	''')

	        st.markdown(''':star: Multilingual accuracy and performance consistency across languages.
	:balloon: If you can answer the question in your native language, can you answer the same question
	correctly in your second/third language?
	''')

	    st.divider()
	    with st.container():
	        st.markdown("##### Citations")

	        st.markdown('''
	:round_pushpin: SeaEval Paper \n
	@article{SeaEval,
	title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
	author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
	journal={NAACL},
	year={2024}
	}
	''')

	def cross_lingual_consistency():
	    """Render the "Cross-Lingual Consistency" leaderboard page.

	    Shows four dropdowns (shot setting, dataset, sort metric, sort order)
	    and passes the selection to ``draw`` under the 'cross_lingual' group.
	    """
	    st.title("Cross-Lingual Consistency")

	    # Display label -> identifier handed to draw(); dict order is menu order.
	    shot_keys = {
	        'Zero Shot': 'zero_shot',
	        'Few Shot': 'few_shot',
	    }
	    dataset_keys = {
	        'Cross-MMLU': 'cross_mmlu',
	        'Cross-XQUAD': 'cross_xquad',
	        'Cross-LogiQA': 'cross_logiqa',
	    }
	    metric_choices = [
	        'Accuracy', 'Cross-Lingual Consistency', 'AC3',
	        'English', 'Chinese', 'Spanish', 'Vietnamese',
	    ]
	    order_choices = ['Ascending', 'Descending']

	    # Five equal-width columns; the third one is left empty as a spacer.
	    shot_col, dataset_col, _, metric_col, order_col = st.columns([0.2] * 5)
	    shot_label = shot_col.selectbox('Select Zero / Few shot', list(shot_keys))
	    dataset_label = dataset_col.selectbox('Select the sub-category', list(dataset_keys))
	    metric = metric_col.selectbox('Sort', metric_choices)
	    order = order_col.selectbox('by', order_choices)

	    if any((shot_label, dataset_label, metric, order)):
	        draw(
	            'cross_lingual',
	            shot_keys[shot_label],
	            dataset_keys[dataset_label],
	            metric,
	            order,
	        )

	def cultural_reasoning():
	    """Render the "Cultural Reasoning" leaderboard page.

	    Shows dropdowns for the shot setting, the dataset and the sort order,
	    then calls ``draw`` for the 'cultural_reasoning' group with 'Accuracy'
	    as the metric.
	    """
	    st.title("Cultural Reasoning")

	    # Display label -> identifier handed to draw(); dict order is menu order.
	    shot_keys = {
	        'Zero Shot': 'zero_shot',
	        'Few Shot': 'few_shot',
	    }
	    dataset_keys = {
	        'SG EVAL V2 MCQ': 'sg_eval_v2_mcq',
	        'SG EVAL V2 Open Ended': 'sg_eval_v2_open',
	        'SG EVAL': 'sg_eval',
	        'SG EVAL V1 Cleaned': 'sg_eval_v1_cleaned',
	        'CN EVAL': 'cn_eval',
	        'PH EVAL': 'ph_eval',
	        'US EVAL': 'us_eval',
	    }

	    # The wide third column is an unused spacer between the dropdowns.
	    shot_col, dataset_col, _, order_col = st.columns([0.2, 0.2, 0.4, 0.2])
	    shot_label = shot_col.selectbox('Select Zero / Few shot', list(shot_keys))
	    dataset_label = dataset_col.selectbox('Select the sub-category', list(dataset_keys))
	    order = order_col.selectbox('sorted by', ['Ascending', 'Descending'])

	    if any((shot_label, dataset_label, order)):
	        draw(
	            'cultural_reasoning',
	            shot_keys[shot_label],
	            dataset_keys[dataset_label],
	            'Accuracy',
	            order,
	        )


	def general_reasoning():
	    """Render the "General Reasoning" leaderboard page.

	    Shows dropdowns for the shot setting, the dataset and the sort order,
	    then calls ``draw`` for the 'general_reasoning' group with 'Accuracy'
	    as the metric.
	    """
	    st.title("General Reasoning")

	    # Display label -> identifier handed to draw(); dict order is menu order.
	    shot_keys = {
	        'Zero Shot': 'zero_shot',
	        'Few Shot': 'few_shot',
	    }
	    dataset_keys = {
	        'MMLU': 'mmlu',
	        'CMMLU': 'cmmlu',
	        'IndoMMLU': 'indommlu',
	        'C Eval': 'c_eval',
	        'ZBench': 'zbench',
	    }

	    # The wide third column is an unused spacer between the dropdowns.
	    shot_col, dataset_col, _, order_col = st.columns([0.2, 0.2, 0.4, 0.2])
	    shot_label = shot_col.selectbox('Select Zero / Few shot', list(shot_keys))
	    dataset_label = dataset_col.selectbox('Select the sub-category', list(dataset_keys))
	    order = order_col.selectbox('sorted by', ['Ascending', 'Descending'])

	    if any((shot_label, dataset_label, order)):
	        draw(
	            'general_reasoning',
	            shot_keys[shot_label],
	            dataset_keys[dataset_label],
	            'Accuracy',
	            order,
	        )

	def flores():
	    """Render the "FLORES-Translation" leaderboard page.

	    Shows dropdowns for the shot setting, the translation direction and the
	    sort order, then calls ``draw`` for the 'flores_translation' group with
	    'BLEU' as the metric.
	    """
	    st.title("FLORES-Translation")

	    category_one_dict = {'Zero Shot': 'zero_shot',
	                         'Few Shot': 'few_shot'}
	    category_two_dict = {'Indonesian to English': 'ind2eng',
	                         'Vietnamese to English': 'vie2eng',
	                         'Chinese to English': 'zho2eng',
	                         'Malay to English': 'zsm2eng'}

	    # Menu entries come straight from the mappings so a label can never be
	    # spelled differently in the dropdown and in the lookup table.
	    filters_levelone = list(category_one_dict)
	    filters_leveltwo = list(category_two_dict)

	    # The wide third column is an unused spacer between the dropdowns.
	    left, center, _, right = st.columns([0.2, 0.2, 0.4, 0.2])
	    with left:
	        category_one = st.selectbox('Select Zero / Few shot', filters_levelone)
	    with center:
	        category_two = st.selectbox('Select the sub-category', filters_leveltwo)
	    with right:
	        sortby = st.selectbox('sorted by', ['Ascending', 'Descending'])

	    if category_one or category_two or sortby:
	        category_one = category_one_dict[category_one]
	        category_two = category_two_dict[category_two]
	        draw('flores_translation', category_one, category_two, 'BLEU', sortby)

	def emotion():
	    """Render the "Emotion" leaderboard page.

	    Shows dropdowns for the shot setting, the dataset and the sort order,
	    then calls ``draw`` for the 'emotion' group with 'Accuracy' as the
	    metric.
	    """
	    st.title("Emotion")

	    # Display label -> identifier handed to draw(); dict order is menu order.
	    shot_keys = {
	        'Zero Shot': 'zero_shot',
	        'Few Shot': 'few_shot',
	    }
	    dataset_keys = {
	        'Indonesian Emotion Classification': 'ind_emotion',
	        'SST2': 'sst2',
	    }

	    # The wide third column is an unused spacer between the dropdowns.
	    shot_col, dataset_col, _, order_col = st.columns([0.2, 0.2, 0.4, 0.2])
	    shot_label = shot_col.selectbox('Select Zero / Few shot', list(shot_keys))
	    dataset_label = dataset_col.selectbox('Select the sub-category', list(dataset_keys))
	    order = order_col.selectbox('sorted by', ['Ascending', 'Descending'])

	    if any((shot_label, dataset_label, order)):
	        draw(
	            'emotion',
	            shot_keys[shot_label],
	            dataset_keys[dataset_label],
	            'Accuracy',
	            order,
	        )

	def dialogue():
	    """Render the "Dialogue" leaderboard page.

	    Shows dropdowns for the shot setting, the dataset, the sort metric and
	    the sort order, then calls ``draw`` for the 'dialogue' group. The metric
	    menu depends on the dataset: 'DREAM' offers only 'Accuracy', the other
	    datasets offer the ROUGE variants and their average.
	    """
	    st.title("Dialogue")

	    # Display label -> identifier handed to draw(); dict order is menu order.
	    shot_keys = {
	        'Zero Shot': 'zero_shot',
	        'Few Shot': 'few_shot',
	    }
	    dataset_keys = {
	        'DREAM': 'dream',
	        'SAMSum': 'samsum',
	        'DialogSum': 'dialogsum',
	    }

	    # Five equal-width columns; the third one is left empty as a spacer.
	    shot_col, dataset_col, _, metric_col, order_col = st.columns([0.2] * 5)
	    shot_label = shot_col.selectbox('Select Zero / Few shot', list(shot_keys))
	    dataset_label = dataset_col.selectbox('Select the sub-category', list(dataset_keys))

	    metric_choices = (
	        ['Accuracy']
	        if dataset_label == 'DREAM'
	        else ['Average', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L']
	    )
	    metric = metric_col.selectbox('Sort', metric_choices)
	    order = order_col.selectbox('by', ['Ascending', 'Descending'])

	    if any((shot_label, dataset_label, metric, order)):
	        draw(
	            'dialogue',
	            shot_keys[shot_label],
	            dataset_keys[dataset_label],
	            metric,
	            order,
	        )

	def fundamental_nlp_tasks():
	    """Render the "Fundamental NLP Tasks" leaderboard page.

	    Shows dropdowns for the shot setting, the dataset and the sort order,
	    then calls ``draw`` for the 'fundamental_nlp_tasks' group with
	    'Accuracy' as the metric.
	    """
	    st.title("Fundamental NLP Tasks")

	    shot_keys = {
	        'Zero Shot': 'zero_shot',
	        'Few Shot': 'few_shot',
	    }
	    dataset_labels = ['OCNLI', 'C3', 'COLA', 'QQP', 'MNLI', 'QNLI', 'WNLI', 'RTE', 'MRPC']
	    # Each dataset identifier is simply the lower-cased display label.
	    dataset_keys = {label: label.lower() for label in dataset_labels}

	    # The wide third column is an unused spacer between the dropdowns.
	    shot_col, dataset_col, _, order_col = st.columns([0.2, 0.2, 0.4, 0.2])
	    shot_label = shot_col.selectbox('Select Zero / Few shot', list(shot_keys))
	    dataset_label = dataset_col.selectbox('Select the sub-category', dataset_labels)
	    order = order_col.selectbox('sorted by', ['Ascending', 'Descending'])

	    if any((shot_label, dataset_label, order)):
	        draw(
	            'fundamental_nlp_tasks',
	            shot_keys[shot_label],
	            dataset_keys[dataset_label],
	            'Accuracy',
	            order,
	        )