Spaces:

TheFinAI
/

IJCAI-2024-FinLLM-Learderboard

Running

IJCAI-2024-FinLLM-Learderboard / app.py

Jimin Huang

feat: modify leaderboard

670a324 7 months ago

4.62 kB

	# matplotlib.use('macosx')
	import gradio as gr
	import matplotlib
	import numpy as np
	import pandas as pd
	from apscheduler.schedulers.background import BackgroundScheduler

	# Leaderboard schemas, one list per task. Each entry is a
	# (column name, type tag) pair; only the names are consumed below.
	TASK1_COLS = [
	    ("Model", "str"),
	    ("Acc", "number"),
	    ("F1", "number"),
	    ("MCC", "number"),
	]

	TASK2_COLS = [
	    ("Model", "str"),
	    ("Rouge-1", "number"),
	    ("Rouge-2", "number"),
	    ("Rouge-L", "number"),
	    ("BertScore", "number"),
	    ("BartScore", "number"),
	]

	TASK3_COLS = [
	    ("Model", "str"),
	    ("Sharpe Ratio", "number"),
	    ("Sharpe Ratio - DRIV", "number"),
	    ("Sharpe Ratio - FORM", "number"),
	    ("Sharpe Ratio - JNJ", "number"),
	    ("Sharpe Ratio - MSFT", "number"),
	]


	# Bare column names in schema order (the type tag of each pair is dropped).
	task1_cols = [pair[0] for pair in TASK1_COLS]
	task2_cols = [pair[0] for pair in TASK2_COLS]
	task3_cols = [pair[0] for pair in TASK3_COLS]


	def create_df_dict(lang, lang_cols):
	    """Load one results CSV and wrap it in a single-entry dict.

	    Args:
	        lang: File-name prefix; data is read from ``f"{lang}_result.csv"``.
	        lang_cols: Column names assigned to the CSV columns. They are passed
	            to pandas as ``names=``, so the file is read as having no header
	            row (its first line is treated as data). Must contain "Model".

	    Returns:
	        ``{"overall": DataFrame}`` where the frame has "Model" as its first
	        column followed by the remaining columns sorted by label.

	    Raises:
	        KeyError: If "Model" is not among the loaded columns.
	    """
	    leaderboard_df = pd.read_csv(f"{lang}_result.csv", names=lang_cols)
	    # Sort the columns by label so the metric order is deterministic.
	    leaderboard_df = leaderboard_df.sort_index(axis=1)
	    # Sorting may have displaced "Model"; move it back to the front.
	    other_cols = [col for col in leaderboard_df.columns if col != "Model"]
	    leaderboard_df = leaderboard_df[["Model"] + other_cols]

	    return {"overall": leaderboard_df}


	# Leaderboard tables for every task tab, loaded once when the module runs.
	# Maps tab title ("Task 1".."Task 3") to the dict returned by
	# create_df_dict for the matching "task<N>" file prefix.
	df_lang = {
	    f"Task {task_no}": create_df_dict(f"task{task_no}", task_cols)
	    for task_no, task_cols in enumerate((task1_cols, task2_cols, task3_cols), start=1)
	}


	# Constants
	# Page heading; launch_gradio passes it to gr.HTML as raw HTML.
	TITLE = '<h1 align="center" id="space-title">🐲 IJCAI 2024 FinLLM Challenge Leaderboard</h1>'
	# Introductory copy; launch_gradio passes it to gr.Markdown.
	INTRODUCTION_TEXT = """📊 Introduction

	The FinLLM Challenge rigorously evaluates state-of-the-art models in financial text analysis, generation, and decision-making tasks. These tasks include financial classification, financial text summarization, and single stock trading.

	📈 Unique Evaluation Metrics

	Our leaderboard incorporates a comprehensive evaluation using diverse metrics like Accuracy, F1 Score, ROUGE, BERTScore, and Sharpe Ratio to assess the models' capabilities in real-world financial applications.

	📚 Task Details

	Task 1: Financial Classification

	- Objective: Classify sentences as claims or premises.
	- Dataset: 7.75k training data, 969 test data.
	- Evaluation Metrics: F1 Score (final ranking metric) and Accuracy.

	Task 2: Financial Text Summarization

	- Objective: Summarize financial news articles into concise texts.
	- Dataset: 8k training data, 2k test data.
	- Evaluation Metrics: ROUGE (1, 2, L) and BERTScore (ROUGE-1 as the final ranking metric).

	Task 3: Single Stock Trading

	- Objective: Make stock trading decisions (buy, sell, hold) with reasonings.
	- Dataset: 291 data points.
	- Evaluation Metrics: Sharpe Ratio (final ranking metric), Cumulative Return, Daily and Annualized Volatility, Maximum Drawdown.

	For more details, refer to our [Challenge page](https://sites.google.com/nlg.csie.ntu.edu.tw/finnlp-agentscen/shared-task-finllm?authuser=0).
	"""


	def create_data_interface(df):
	    """Build a Gradio Dataframe component that displays ``df``.

	    Args:
	        df: Table to show; its column labels become the headers.

	    Returns:
	        A ``gr.components.Dataframe`` whose first column is typed "str" and
	        every other column "number", created with ``max_rows=10``.
	    """
	    column_labels = list(df.columns)
	    column_types = ["str"] + ["number"] * (len(column_labels) - 1)
	    table_rows = df.values.tolist()

	    return gr.components.Dataframe(
	        value=table_rows,
	        headers=column_labels,
	        datatype=column_types,
	        max_rows=10,
	    )


	def plot_radar_chart(df, attributes, category_name):
	    """Draw one filled polar trace per row of ``df`` and return the figure.

	    Args:
	        df: Table with a "Model" column plus the columns named in
	            ``attributes``.
	        attributes: Column labels used both as the angular axis and to pick
	            each row's radial values.
	        category_name: Accepted but not used by this function.

	    Returns:
	        The figure, titled "FLARE", with a visible radial axis fixed to
	        ``[0, 0.9]`` and the legend shown.

	    NOTE(review): ``go`` is not imported in the visible part of this file
	    (presumably ``plotly.graph_objects`` -- confirm); unless it is provided
	    elsewhere, calling this function raises NameError.
	    """
	    radar = go.Figure()

	    for _, record in df.iterrows():
	        model_name = record["Model"]
	        radial_values = record[attributes].tolist()
	        trace = go.Scatterpolar(
	            r=radial_values,
	            theta=attributes,
	            fill="toself",
	            name=model_name,
	        )
	        radar.add_trace(trace)

	    radial_axis = {"visible": True, "range": [0, 0.9]}
	    radar.update_layout(title="FLARE", polar={"radialaxis": radial_axis}, showlegend=True)

	    return radar


	def create_data_interface_for_aggregated(df, category_name):
	    """Plot every column of ``df`` after the first as a radar chart.

	    Prints the selected column labels, then returns whatever
	    ``plot_radar_chart`` returns for them.
	    """
	    metric_columns = df.columns[1:]
	    print(metric_columns)
	    return plot_radar_chart(df, metric_columns, category_name)


	def create_lang_leaderboard(df_dict):
	    """Add one tab per entry of ``df_dict``, each holding that entry's table.

	    Args:
	        df_dict: Mapping of tab title to the DataFrame shown inside the tab.
	    """
	    for tab_title, table in df_dict.items():
	        with gr.Tab(tab_title):
	            create_data_interface(table)


	def launch_gradio():
	    """Assemble the leaderboard page and launch it.

	    The page shows the title, the introduction text, and one tab per entry
	    of ``df_lang``; each tab is filled in by ``create_lang_leaderboard``.
	    """
	    with gr.Blocks() as demo:
	        gr.HTML(TITLE)
	        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
	        for task_title, task_tables in df_lang.items():
	            with gr.Tab(task_title):
	                create_lang_leaderboard(task_tables)

	    demo.launch()


	# Register launch_gradio as an interval job (every 3600 seconds) on a
	# background scheduler and start the scheduler.
	# NOTE(review): df_lang is built once at module level, so a scheduled re-run
	# presumably rebuilds the UI from the same in-memory data -- confirm that
	# calling launch_gradio again every hour is the intended refresh mechanism.
	scheduler = BackgroundScheduler()
	scheduler.add_job(launch_gradio, "interval", seconds=3600)
	scheduler.start()

	# Launch immediately
	launch_gradio()