Spaces:
Build error
Build error
from typing import OrderedDict | |
import streamlit as st # works on command prompt | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import xgboost as xgb | |
from sklearn.metrics import ( | |
roc_curve, | |
) | |
from sklearn.calibration import calibration_curve | |
from xgboost import plot_tree | |
from views.typing import ModelView | |
def plot_logistic_coeff_barh(coef_dict, x, y): | |
fig = plt.figure(figsize=(x, y)) | |
coef_dict_sorted = dict( | |
sorted(coef_dict.items(), key=lambda item: item[1], reverse=False) | |
) | |
plt.barh(*zip(*coef_dict_sorted.items())) | |
return fig | |
def print_negative_coefficients_logistic_model(coef_dict): | |
# Equal to or less than 0 | |
NegativeCoefficients = dict( | |
filter(lambda x: x[1] <= 0.0, coef_dict.items()) | |
) | |
NegativeCoefficientsSorted = sorted( | |
NegativeCoefficients.items(), key=lambda x: x[1], reverse=False | |
) | |
text = ( | |
"\n\nFeatures the model found to be negatively correlated with probability of default are:" | |
"\n{negative_features}:" | |
) | |
st.markdown(text.format(negative_features=NegativeCoefficientsSorted)) | |
st.markdown(type(NegativeCoefficientsSorted)) | |
st.markdown(NegativeCoefficients.items()) | |
def print_positive_coefficients_logistic_model(coef_dict): | |
# Equal to or greater than 0 | |
PositiveCoefficients = dict( | |
filter(lambda x: x[1] >= 0.0, coef_dict.items()) | |
) | |
PositiveCoefficientsSorted = sorted( | |
PositiveCoefficients.items(), key=lambda x: x[1], reverse=True | |
) | |
text = ( | |
"\n\nFeatures the model found to be positively correlated with probability of default are:" | |
"\n{positive_features}:" | |
) | |
st.markdown(text.format(positive_features=PositiveCoefficientsSorted)) | |
def plot_importance_gbt(clf_gbt_model, barxsize, barysize): | |
axobject1 = xgb.plot_importance(clf_gbt_model, importance_type="weight") | |
fig1 = axobject1.figure | |
st.write("Feature Importance Plot (Gradient Boosted Tree)") | |
fig1.set_size_inches(barxsize, barysize) | |
return fig1 | |
def download_importance_gbt(fig1, barxsize, barysize): | |
if st.button( | |
"Download Feature Importance Plot as png (Gradient Boosted Tree)" | |
): | |
dpisize = max(barxsize, barysize) | |
plt.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight") | |
fig1.set_size_inches(barxsize, barysize) | |
def plot_tree_gbt(treexsize, treeysize, clf_gbt_model): | |
plot_tree(clf_gbt_model) | |
fig2 = plt.gcf() | |
fig2.set_size_inches(treexsize, treeysize) | |
return fig2 | |
def download_tree_gbt(treexsize, treeysize): | |
if st.button("Download Decision Tree Plot as png (Gradient Boosted Tree)"): | |
dpisize = max(treexsize, treeysize) | |
plt.savefig("tree.png", dpi=dpisize * 96, bbox_inches="tight") | |
def cross_validation_graph(cv, eval_metric, trees): | |
# Plot the test AUC scores for each iteration | |
fig = plt.figure() | |
plt.plot(cv[cv.columns[2]]) | |
plt.title( | |
"Test {eval_metric} Score Over {it_numbr} Iterations".format( | |
eval_metric=eval_metric, it_numbr=trees | |
) | |
) | |
plt.xlabel("Iteration Number") | |
plt.ylabel("Test {eval_metric} Score".format(eval_metric=eval_metric)) | |
return fig | |
def recall_accuracy_threshold_tradeoff_fig( | |
widthsize, | |
heightsize, | |
threshold_list, | |
thresh_def_recalls_list, | |
thresh_nondef_recalls_list, | |
thresh_accs_list, | |
): | |
fig = plt.figure(figsize=(widthsize, heightsize)) | |
plt.plot(threshold_list, thresh_def_recalls_list, label="Default Recall") | |
plt.plot( | |
threshold_list, thresh_nondef_recalls_list, label="Non-Default Recall" | |
) | |
plt.plot(threshold_list, thresh_accs_list, label="Model Accuracy") | |
plt.xlabel("Probability Threshold") | |
plt.ylabel("Score") | |
plt.xlim(0, 1) | |
plt.ylim(0, 1) | |
plt.legend() | |
plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold") | |
plt.grid(False) | |
return fig | |
def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelView]): | |
colors = ["blue", "green"] | |
fig = plt.figure() | |
for color_idx, (model_name, model_view) in enumerate(model_views.items()): | |
fpr, tpr, _thresholds = roc_curve( | |
y, model_view.prediction_probability_df | |
) | |
plt.plot(fpr, tpr, color=colors[color_idx], label=f"{model_name}") | |
plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction") | |
model_names = list(model_views.keys()) | |
if not model_names: | |
model_name_str = "None" | |
elif len(model_names) == 1: | |
model_name_str = model_names[0] | |
else: | |
model_name_str = " and ".join( | |
[", ".join(model_names[:-1]), model_names[-1]] | |
) | |
plt.title(f"ROC Chart for {model_name_str} on the Probability of Default") | |
plt.xlabel("False Positive Rate (FP Rate)") | |
plt.ylabel("True Positive Rate (TP Rate)") | |
plt.legend() | |
plt.grid(False) | |
plt.xlim(0, 1) | |
plt.ylim(0, 1) | |
return fig | |
def calibration_curve_report_commented_n( | |
y, model_views: OrderedDict[str, ModelView], bins: int | |
): | |
fig = plt.figure() | |
for model_name, model_view in model_views.items(): | |
frac_of_pos, mean_pred_val = calibration_curve( | |
y, | |
model_view.prediction_probability_df, | |
n_bins=bins, | |
normalize=True, | |
) | |
plt.plot(mean_pred_val, frac_of_pos, "s-", label=f"{model_name}") | |
# Create the calibration curve plot with the guideline | |
plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated") | |
plt.ylabel("Fraction of positives") | |
plt.xlabel("Average Predicted Probability") | |
plt.title("Calibration Curve") | |
plt.legend() | |
plt.grid(False) | |
plt.xlim(0, 1) | |
plt.ylim(0, 1) | |
return fig | |
def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins): | |
# Probability distribution | |
probability_stat_distribution = probability_default.describe() | |
# Acceptance rate threshold | |
acc_rate_thresh = np.quantile(probability_default, acceptancerate) | |
fig = plt.figure() | |
plt.hist( | |
probability_default, | |
color="blue", | |
bins=bins, | |
histtype="bar", | |
ec="white", | |
) | |
# Add a reference line to the plot for the threshold | |
plt.axvline(x=acc_rate_thresh, color="red") | |
plt.title("Acceptance Rate Thershold") | |
return ( | |
fig, | |
probability_stat_distribution, | |
acc_rate_thresh, | |
) | |
def streamlit_2columns_metrics_pct_df( | |
column1name_label: str, | |
column2name_label: str, | |
df: pd.DataFrame, | |
): | |
( | |
column1name, | |
column2name, | |
) = st.columns(2) | |
with column1name: | |
st.metric( | |
label=column1name_label, | |
value="{:.0%}".format(df.value_counts().get(1) / df.shape[0]), | |
delta=None, | |
delta_color="normal", | |
) | |
with column2name: | |
st.metric( | |
label=column2name_label, | |
value="{:.0%}".format(df.value_counts().get(0) / df.shape[0]), | |
delta=None, | |
delta_color="normal", | |
) | |
def streamlit_2columns_metrics_df( | |
column1name_label: str, | |
column2name_label: str, | |
df: pd.DataFrame, | |
): | |
( | |
column1name, | |
column2name, | |
) = st.columns(2) | |
with column1name: | |
st.metric( | |
label=column1name_label, | |
value=df.value_counts().get(1), | |
delta=None, | |
delta_color="normal", | |
) | |
with column2name: | |
st.metric( | |
label=column2name_label, | |
value=df.value_counts().get(0), | |
delta=None, | |
delta_color="normal", | |
) | |
def streamlit_2columns_metrics_df_shape(df: pd.DataFrame): | |
( | |
column1name, | |
column2name, | |
) = st.columns(2) | |
with column1name: | |
st.metric( | |
label="Rows", | |
value=df.shape[0], | |
delta=None, | |
delta_color="normal", | |
) | |
with column2name: | |
st.metric( | |
label="Columns", | |
value=df.shape[1], | |
delta=None, | |
delta_color="normal", | |
) | |
def streamlit_2columns_metrics_pct_series( | |
column1name_label: str, | |
column2name_label: str, | |
series: pd.Series, | |
): | |
( | |
column1name, | |
column2name, | |
) = st.columns(2) | |
with column1name: | |
st.metric( | |
label=column1name_label, | |
value="{:.0%}".format(series.get(1) / series.sum()), | |
delta=None, | |
delta_color="normal", | |
) | |
with column2name: | |
st.metric( | |
label=column2name_label, | |
value="{:.0%}".format(series.get(0) / series.sum()), | |
delta=None, | |
delta_color="normal", | |
) | |
def streamlit_2columns_metrics_series( | |
column1name_label: str, | |
column2name_label: str, | |
series: pd.Series, | |
): | |
( | |
column1name, | |
column2name, | |
) = st.columns(2) | |
with column1name: | |
st.metric( | |
label=column1name_label, | |
value=series.get(1), | |
delta=None, | |
delta_color="normal", | |
) | |
with column2name: | |
st.metric( | |
label=column2name_label, | |
value=series.get(0), | |
delta=None, | |
delta_color="normal", | |
) | |
def streamlit_chart_setting_height_width( | |
title: str, | |
default_widthvalue: int, | |
default_heightvalue: int, | |
widthkey: str, | |
heightkey: str, | |
): | |
with st.expander(title): | |
lbarx_col, lbary_col = st.columns(2) | |
with lbarx_col: | |
width_size = st.number_input( | |
label="Width in inches:", | |
value=default_widthvalue, | |
key=widthkey, | |
) | |
with lbary_col: | |
height_size = st.number_input( | |
label="Height in inches:", | |
value=default_heightvalue, | |
key=heightkey, | |
) | |
return width_size, height_size | |