|
|
|
import gradio as gr |
|
import matplotlib |
|
import numpy as np |
|
import pandas as pd |
|
from apscheduler.schedulers.background import BackgroundScheduler |
|
|
|
# Column schemas for the three leaderboards, as (column name, datatype) pairs.
# In every schema the first column is the model name ("str") and all remaining
# columns are metrics ("number").
TASK1_COLS = [("Model", "str")] + [
    (metric, "number") for metric in ("Acc", "F1", "MCC")
]

TASK2_COLS = [("Model", "str")] + [
    (metric, "number")
    for metric in ("Rouge-1", "Rouge-2", "Rouge-L", "BertScore", "BartScore")
]

TASK3_COLS = [("Model", "str")] + [
    (metric, "number")
    for metric in (
        "Sharpe Ratio",
        "Sharpe Ratio - DRIV",
        "Sharpe Ratio - FORM",
        "Sharpe Ratio - JNJ",
        "Sharpe Ratio - MSFT",
    )
]


# Bare column names (datatype dropped), in the same order as the schemas above.
task1_cols = [entry[0] for entry in TASK1_COLS]
task2_cols = [entry[0] for entry in TASK2_COLS]
task3_cols = [entry[0] for entry in TASK3_COLS]
|
|
|
|
|
def create_df_dict(lang, lang_cols):
    """Load one task's results CSV into a leaderboard table.

    Reads ``f"{lang}_result.csv"`` with ``lang_cols`` as the column names,
    sorts the columns alphabetically, then moves ``"Model"`` to the front.

    Args:
        lang: Path prefix of the CSV file; ``"_result.csv"`` is appended.
        lang_cols: Column names to assign, in the order the values appear
            in the file. Must contain ``"Model"``.

    Returns:
        A dict with the single key ``"overall"`` mapped to the DataFrame.

    Raises:
        KeyError: If ``"Model"`` is not one of the resulting columns.
    """
    # NOTE(review): with `names=` and no `header=`, pandas treats the first
    # line of the file as data — confirm the CSVs carry no header row.
    leaderboard_df = pd.read_csv(f"{lang}_result.csv", names=lang_cols)
    leaderboard_df = leaderboard_df.sort_index(axis=1)

    # Keep the alphabetical order for the metrics but pin "Model" first.
    metric_cols = [col for col in leaderboard_df.columns if col != "Model"]
    leaderboard_df = leaderboard_df[["Model"] + metric_cols]

    return {"overall": leaderboard_df}
|
|
|
|
|
# Tab label -> tables for that task. Built once, when the module is imported,
# by reading task1_result.csv, task2_result.csv and task3_result.csv in order.
df_lang = {
    tab_label: create_df_dict(csv_prefix, column_names)
    for tab_label, csv_prefix, column_names in (
        ("Task 1", "task1", task1_cols),
        ("Task 2", "task2", task2_cols),
        ("Task 3", "task3", task3_cols),
    )
}
|
|
|
|
|
|
|
# Page heading, passed to gr.HTML in launch_gradio().
# NOTE(review): the "π²" right after the opening tag looks like a mis-encoded
# emoji; the intended glyph cannot be determined from this file — confirm
# against the upstream source before changing the string.
TITLE = '<h1 align="center" id="space-title">π² IJCAI 2024 FinLLM Challenge Leaderboard</h1>'

# Markdown body passed to gr.Markdown in launch_gradio().
# NOTE(review): the leading "π" on the "Introduction", "Unique Evaluation
# Metrics" and "Task Details" lines looks like the same kind of mis-encoding —
# confirm the intended characters.
INTRODUCTION_TEXT = """π Introduction

The FinLLM Challenge rigorously evaluates state-of-the-art models in financial text analysis, generation, and decision-making tasks. These tasks include financial classification, financial text summarization, and single stock trading.

π Unique Evaluation Metrics

Our leaderboard incorporates a comprehensive evaluation using diverse metrics like Accuracy, F1 Score, ROUGE, BERTScore, and Sharpe Ratio to assess the models' capabilities in real-world financial applications.

π Task Details

**Task 1: Financial Classification**

- **Objective:** Classify sentences as claims or premises.
- **Dataset:** 7.75k training data, 969 test data.
- **Evaluation Metrics:** F1 Score (final ranking metric) and Accuracy.

**Task 2: Financial Text Summarization**

- **Objective:** Summarize financial news articles into concise texts.
- **Dataset:** 8k training data, 2k test data.
- **Evaluation Metrics:** ROUGE (1, 2, L) and BERTScore (ROUGE-1 as the final ranking metric).

**Task 3: Single Stock Trading**

- **Objective:** Make stock trading decisions (buy, sell, hold) with reasonings.
- **Dataset:** 291 data points.
- **Evaluation Metrics:** Sharpe Ratio (final ranking metric), Cumulative Return, Daily and Annualized Volatility, Maximum Drawdown.

For more details, refer to our [Challenge page](https://sites.google.com/nlg.csie.ntu.edu.tw/finnlp-agentscen/shared-task-finllm?authuser=0).
"""
|
|
|
|
|
def create_data_interface(df):
    """Build a Gradio Dataframe component displaying *df*.

    The first column is declared with datatype "str" and every remaining
    column with datatype "number". The component is created with
    ``max_rows=10``.

    Args:
        df: Table to display; its column labels become the headers.

    Returns:
        The ``gr.components.Dataframe`` instance.
    """
    column_names = list(df.columns)
    column_types = ["str"] + ["number"] * (len(column_names) - 1)
    rows = df.values.tolist()

    return gr.components.Dataframe(
        value=rows,
        headers=column_names,
        datatype=column_types,
        max_rows=10,
    )
|
|
|
|
|
def plot_radar_chart(df, attributes, category_name):
    """Draw one filled polar trace per row of *df*.

    Each row contributes a ``go.Scatterpolar`` trace named after its "Model"
    value, with the row's *attributes* values as radii and *attributes* as the
    angular labels. The figure title is fixed to "FLARE" and the radial axis
    range to [0, 0.9].

    Args:
        df: Table with a "Model" column plus the columns in *attributes*.
        attributes: Column labels to plot.
        category_name: Accepted but not used by this function.

    Returns:
        The populated figure.
    """
    # NOTE(review): `go` is not bound by any import visible in this file;
    # presumably plotly.graph_objects — confirm, otherwise calling this
    # function raises NameError.
    fig = go.Figure()

    for _, record in df.iterrows():
        label = record["Model"]
        radii = record[attributes].tolist()
        trace = go.Scatterpolar(r=radii, theta=attributes, fill="toself", name=label)
        fig.add_trace(trace)

    radial_axis = dict(visible=True, range=[0, 0.9])
    fig.update_layout(
        title="FLARE",
        polar=dict(radialaxis=radial_axis),
        showlegend=True,
    )

    return fig
|
|
|
|
|
def create_data_interface_for_aggregated(df, category_name):
    """Return a radar chart over every column of *df* except the first.

    The selected column labels are printed to stdout before plotting.

    Args:
        df: Table whose first column is skipped and whose remaining columns
            are plotted.
        category_name: Forwarded unchanged to ``plot_radar_chart``.

    Returns:
        Whatever ``plot_radar_chart`` returns.
    """
    metric_columns = df.columns[1:]
    print(metric_columns)
    return plot_radar_chart(df, metric_columns, category_name)
|
|
|
|
|
def create_lang_leaderboard(df_dict):
    """Render one Gradio tab per entry of *df_dict*.

    Args:
        df_dict: Mapping of tab label to the table shown inside that tab.
    """
    for tab_label, table in df_dict.items():
        tab = gr.Tab(tab_label)
        with tab:
            create_data_interface(table)
|
|
|
|
|
def launch_gradio():
    """Assemble the leaderboard page and launch it.

    The page shows TITLE as HTML, INTRODUCTION_TEXT as Markdown, then one
    tab per entry of ``df_lang``, each filled by ``create_lang_leaderboard``.
    """
    with gr.Blocks() as demo:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

        for task_name, task_tables in df_lang.items():
            with gr.Tab(task_name):
                create_lang_leaderboard(task_tables)

    demo.launch()
|
|
|
|
|
# Register launch_gradio as an "interval" job firing every 3600 seconds on a
# background scheduler, and start that scheduler before the initial launch.
# NOTE(review): every firing calls launch_gradio() again, which builds a new
# gr.Blocks app and calls launch() on it. df_lang is built once at import, so
# the CSV files are not re-read by this job — confirm this is the intended
# refresh mechanism.
scheduler = BackgroundScheduler()
scheduler.add_job(launch_gradio, "interval", seconds=3600)
scheduler.start()


# Build and launch the app once at startup.
launch_gradio()
|
|