import math
import re
from typing import *

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from sklearn.linear_model import LogisticRegression

from modules.nav import Navbar


# Page related utils
def default_page_setting(
    layout: Literal["wide", "centered"] = "centered",
):
    st.set_page_config(page_title="VARCO Arena", layout=layout)
    sidebar_placeholder = st.sidebar.empty()

    css = """
    <style>
        .appview-container .main .block-container {
            padding-top: 32px;
        }
        [data-testid="stSidebarNav"] > ul {
            padding-top: 32px;
        }
    </style>
    """
    st.markdown(css, unsafe_allow_html=True)

    if "korean" not in st.session_state:
        st.session_state["korean"] = False
    return sidebar_placeholder


# Update is_running and refresh only the sidebar
def set_nav_bar(is_running: bool, sidebar_placeholder=None, toggle_hashstr: str = None):
    st.session_state["is_running"] = is_running
    # Refresh only the sidebar content
    Navbar(sidebar_placeholder, toggle_hashstr=toggle_hashstr)


def set_prompt_preview(did_select_prompt: bool, expander_placeholder=None):
    # `expander_placeholder` is accepted but currently unused here.
    st.session_state["did_select_prompt"] = did_select_prompt


def show_linebreak_in_md(text: str) -> str:
    # Markdown needs two trailing spaces before "\n" to render a hard line break.
    return text.replace("\n", "  \n") if isinstance(text, str) else "(Empty)"
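
# Example (illustrative, not part of the original module):
# show_linebreak_in_md("a\nb") -> "a  \nb", which Markdown renders as a hard
# line break; non-string inputs yield "(Empty)".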


def escape_markdown(text: str, version: int = 2, entity_type: str = None) -> str:
    """
    Helper function to escape telegram markup symbols.

    Args:
        text (:obj:`str`): The text.
        version (:obj:`int` | :obj:`str`): Use to specify the version of telegram's Markdown.
            Either ``1`` or ``2``. Defaults to ``2``.
        entity_type (:obj:`str`, optional): For the entity types ``PRE``, ``CODE`` and the link
            part of ``TEXT_LINKS``, only certain characters need to be escaped in ``MarkdownV2``.
            See the official API documentation for details. Only valid in combination with
            ``version=2``, will be ignored else.
    """
    if int(version) == 1:
        escape_chars = r"_*`["
    elif int(version) == 2:
        if entity_type in ["pre", "code"]:
            escape_chars = r"\`"
        elif entity_type == "text_link":
            escape_chars = r"\)"
        else:
            escape_chars = r"_*[]()~`>#+-=|{}.!:"
    else:
        raise ValueError("Markdown version must be either 1 or 2!")

    return re.sub(f"([{re.escape(escape_chars)}])", r"\\\1", text)
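
# Example (illustrative, not part of the original module): with the MarkdownV2
# defaults above, escape_markdown("a_b*c (d)") -> r"a\_b\*c \(d\)".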


# Elo result related computations
def compute_relative_winrate_to_1st(elo_df, float_pts: int = 3):
    """
    Post-processing utility for saving the elo table to an excel file.
    May serve as an absolute measure of quality.

    elo_df:
        columns: Model, Elo rating
    adds:
        column: winrate_vs_1st
    """
    from functools import partial

    rating1st = elo_df["Elo rating"].max()
    win_rate_to_1st = partial(elo_to_winrate, rating_b=rating1st)
    elo_df["winrate_vs_1st"] = elo_df["Elo rating"].apply(win_rate_to_1st)
    return elo_df
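
# Note (illustrative, not part of the original module): the rank-1 model is
# compared against its own rating, so its "winrate_vs_1st" is exactly 0.5;
# every other model lands below 0.5 (or at 0.5 if tied with the leader).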


def elo_to_winrate(rating_a: float = None, rating_b: float = None) -> float:
    # compute P(A wins B) from ratings
    rate_diff = rating_a - rating_b
    win_rate = 1 / (1 + 10 ** (-rate_diff / 400))
    return win_rate
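
# Worked example (illustrative, not part of the original module):
# elo_to_winrate(1100, 1000) = 1 / (1 + 10 ** (-100 / 400)) ≈ 0.640,
# i.e. a 100-point Elo gap corresponds to roughly a 64% expected win rate.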


def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
    if isinstance(df, list):
        df = pd.DataFrame(df)
    df = df.dropna(subset=["winner", "model_a", "model_b"])  # drop None-vs-something battles

    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # duplicate battles
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]

    # Design matrix: +log(BASE) for model_a, -log(BASE) for model_b in each battle
    X = np.zeros([n, p])
    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)

    # one A win => two A wins (battles are duplicated above)
    Y = np.zeros(n)
    Y[df["winner"] == "A"] = 1.0

    WARNING = (
        "elo.py:L{L} compute_mle_elo() // Warning: Seeing this message indicates that the "
        "regression result for elo is unreliable. You should be test-running Varco Arena, "
        "or something odd (perfect one-sided wins) is happening.\n\n"
        "To avoid a logistic regression error, manually adding the other class."
    )
    if (Y == 0).all():
        print(WARNING.format(L=32))
        Y[-1] = 1.0
    elif (Y == 1.0).all():
        print(WARNING.format(L=35))
        Y[-1] = 0.0

    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    elo_scores = pd.Series(elo_scores, index=models.index).sort_values(ascending=False)

    df = (
        pd.DataFrame(
            [[model, round(elo_scores[model], 2)] for model in elo_scores.keys()],
            columns=["Model", "Elo rating"],
        )
        .sort_values("Elo rating", ascending=False)
        .reset_index(drop=True)
    )
    df.index = df.index + 1  # rank starts at 1
    return df
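
# Usage sketch (illustrative; the records below are made up): compute_mle_elo
# expects battle records with "model_a", "model_b", and "winner" ("A" or "B"), e.g.
#   compute_mle_elo([
#       {"model_a": "m1", "model_b": "m2", "winner": "A"},
#       {"model_a": "m2", "model_b": "m1", "winner": "B"},
#   ])
# returns a rank-indexed DataFrame with columns ["Model", "Elo rating"],
# centered around INIT_RATING=1000.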


def fill_missing_values(df, default_value=0):
    """
    Used for completing a pivot table (making it square over the union of its
    index and columns).
    """
    # Existing index and column labels.
    existing_index = set(df.index)
    existing_columns = set(df.columns)

    # All possible labels: the union of both axes.
    all_index = set(df.index.union(df.columns))
    all_columns = set(df.index.union(df.columns))

    # Labels missing from each axis.
    missing_index = all_index - existing_index
    missing_columns = all_columns - existing_columns

    # Add missing rows with the default value.
    for idx in missing_index:
        df.loc[idx] = default_value

    # Add missing columns with the default value.
    for col in missing_columns:
        df[col] = default_value

    # Re-sort the index and columns.
    df.sort_index(axis=0, inplace=True)
    df.sort_index(axis=1, inplace=True)
    return df
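
# Example (illustrative, not part of the original module): a pivot table with
# index ["m1"] and columns ["m2"] comes back as a 2x2 table over ["m1", "m2"]
# on both axes, with the added row and column filled with `default_value`.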


def _plot_length_bias(results, judgename: str = None, ratio: bool = True):
    if not isinstance(results, pd.DataFrame):
        results = pd.DataFrame.from_dict(results)

    if ratio:

        def _win_to_loss_wc_ratio(row):
            # Word-count ratio of the winning output to the losing output.
            try:
                if row.winner == "A":
                    ratio = len(row.generated_a.split()) / len(row.generated_b.split())
                else:
                    ratio = len(row.generated_b.split()) / len(row.generated_a.split())
            except Exception:
                ratio = None
            return ratio

        df = results
        df["ratio"] = df.apply(_win_to_loss_wc_ratio, axis=1)
        df["category"] = "win/loss wc ratio"

        # Create the violin plot
        plot_df = df.drop(
            columns=[col for col in df if col not in ["category", "ratio"]]
        )
        fig = px.violin(
            plot_df,
            x="category",
            y="ratio",
            # log_y=True,
            title=f"Length bias ({judgename})",
            # labels={"category": "win/loss wc ratio", "ratio": "ratio"},
        )
    else:
        data = []
        for _, row in results.iterrows():
            data.append(
                {
                    "category": "won",
                    "wordcounts": len(row.generated_a.split())
                    if row["winner"] == "A"
                    else len(row.generated_b.split()),
                }
            )
            data.append(
                {
                    "category": "lost",
                    "wordcounts": len(row.generated_b.split())
                    if row["winner"] == "A"
                    else len(row.generated_a.split()),
                }
            )
            data.append(
                {
                    "category": "won/lost ratio",
                    "wordcounts": len(row.generated_a.split())
                    / len(row.generated_b.split())  # a won
                    if row["winner"] == "A"
                    else len(row.generated_b.split())
                    / len(row.generated_a.split()),  # b won
                }
            )
        plot_df = pd.DataFrame(data)

        # Create the violin plot
        fig = px.violin(
            plot_df,
            x="category",
            y="wordcounts",
            # log_y=True,
            title=f"Length bias ({judgename})",
            labels={"category": "outcome", "wordcounts": "word count"},
        )
    return fig, plot_df
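
# Note (illustrative, not part of the original module): `results` is expected to
# carry "winner" ("A"/"B"), "generated_a", and "generated_b" columns; with
# ratio=True the violin shows the word-count ratio of each winning output to
# its losing counterpart.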


def visualization(results, is_overall=False):
    """
    Originally taken from varco_arena/visualization.py, but it has been updated
    a lot since then, so be careful!
    """
    if not isinstance(results, pd.DataFrame):
        results = pd.DataFrame.from_dict(results)

    figure_dict = {}
    judgename = results.iloc[0]["evaluation_model"]

    # Judge bias: length
    fig, plot_df = _plot_length_bias(results, judgename=judgename)
    figure_dict["length_bias"] = fig
    figure_dict["length_bias_df"] = plot_df

    # Judge bias: position A/B
    fig = px.bar(
        results["winner"].value_counts(),
        title=f"Position A/B bias\n({judgename})",
        text_auto=True,
        height=400,
    )
    fig.update_layout(xaxis_title="Match Winner", yaxis_title="Count", showlegend=False)
    figure_dict["counts_of_match_winners"] = fig

    # Num. matches of each model
    fig = px.bar(
        pd.concat([results["model_a"], results["model_b"]]).value_counts(),
        title="Match Count per Model",
        text_auto=True,
    )
    fig.update_layout(
        xaxis_title="Model", yaxis_title="Match Count", height=400, showlegend=False
    )
    figure_dict["match_count_for_each_model"] = fig

    # Num. matches matrix (model vs. model)
    ptbl = pd.pivot_table(
        results,
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    match_counts = ptbl + ptbl.T
    ordering = match_counts.sum().sort_values(ascending=False).index
    fig = px.imshow(
        match_counts.loc[ordering, ordering],
        title="Number of Matches (model vs. model)",
        text_auto=True,
    )
    fig.update_layout(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        height=800,
        width=800,
        title_xanchor="left",
        title_yanchor="top",
        font=dict(size=10),
    )
    fig.update_traces(
        hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>"
    )
    figure_dict["match_count_of_each_combination_of_models"] = fig

    # Win rate matrix (model vs. model): wins as position A plus wins as position B,
    # divided by the total number of matches for each pair
    a_win_ptbl = pd.pivot_table(
        results[results["winner"] == "A"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    a_win_ptbl = fill_missing_values(a_win_ptbl)
    b_win_ptbl = pd.pivot_table(
        results[results["winner"] == "B"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    b_win_ptbl = fill_missing_values(b_win_ptbl)
    num_results_ptbl = pd.pivot_table(
        results, index="model_a", columns="model_b", aggfunc="size", fill_value=0
    )
    row_beats_col_freq = (a_win_ptbl + b_win_ptbl.T) / (
        num_results_ptbl + num_results_ptbl.T
    )
    prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
    model_names = list(prop_wins.keys())
    row_beats_col = row_beats_col_freq.loc[model_names, model_names]
    fig = px.imshow(
        row_beats_col,
        color_continuous_scale="RdBu",
        text_auto=".2f",
        title="P(A wins B)",
    )
    fig.update_layout(
        xaxis_title="Model B",
        yaxis_title="Model A",  # y axis = row = index
        title_xanchor="left",
        title_yanchor="top",
        xaxis_side="top",
        height=800,
        width=800,
    )
    fig.update_traces(
        hovertemplate="Model A: %{y}<br>Model B: %{x}<br>P(A wins B): %{z}<extra></extra>"
    )
    figure_dict["fraction_of_model_a_wins_for_all_a_vs_b_matches"] = fig

    # Elo rating
    elo = compute_mle_elo(results)
    elo_wr = compute_relative_winrate_to_1st(elo)
    # beautify
    elo_wr["Elo rating"] = elo_wr["Elo rating"].astype(int)
    elo_wr["winrate_vs_1st"] = elo_wr["winrate_vs_1st"].round(3)
    elo_wr.index.name = "Rank"
    figure_dict["elo_rating"] = elo_wr

    # Elo rating by task: radar chart
    if is_overall:
        tasks = results["task"].unique().tolist()
        elo_by_task = pd.concat(
            [
                compute_mle_elo(results[results["task"] == task]).assign(task=task)
                for task in tasks
            ]
        )
        fig = px.line_polar(
            elo_by_task,
            r="Elo rating",
            theta="task",
            line_close=True,
            category_orders={"task": tasks},
            color="Model",
            markers=True,
            color_discrete_sequence=px.colors.qualitative.Pastel,
            title="Elo Rating by Task",
        )
        figure_dict["elo_rating_by_task"] = fig

    figure_dict["judgename"] = judgename
    return figure_dict
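

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the original module): the battle
    # records below are synthetic, and running this file directly assumes the
    # repo layout where `modules.nav` is importable.
    _demo_battles = pd.DataFrame(
        [
            {
                "model_a": a,
                "model_b": b,
                "winner": w,
                "generated_a": "alpha beta gamma",
                "generated_b": "delta epsilon",
                "evaluation_model": "demo-judge",
                "task": "demo",
            }
            for a, b, w in [
                ("m1", "m2", "A"),
                ("m2", "m1", "B"),
                ("m1", "m3", "A"),
                ("m3", "m1", "B"),
                ("m2", "m3", "A"),
                ("m3", "m2", "B"),
            ]
        ]
    )
    _figs = visualization(_demo_battles, is_overall=False)
    print(_figs["elo_rating"])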