Paul Kiage
Hugging Face Deployment Setup (#11)
7d861ad unverified
raw
history blame
9.93 kB
from typing import OrderedDict
import streamlit as st # works on command prompt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import (
roc_curve,
)
from sklearn.calibration import calibration_curve
from xgboost import plot_tree
from views.typing import ModelView
def plot_logistic_coeff_barh(coef_dict, x, y):
    """Return a horizontal bar chart of logistic-regression coefficients.

    Args:
        coef_dict: mapping of feature name -> coefficient value.
        x: figure width in inches.
        y: figure height in inches.

    Returns:
        The matplotlib Figure containing the bar chart, with bars ordered
        from smallest coefficient (bottom) to largest (top).
    """
    figure = plt.figure(figsize=(x, y))
    # Sort pairs by coefficient value, ascending, then split into the
    # (labels, values) argument pair that barh expects.
    ordered_pairs = sorted(coef_dict.items(), key=lambda pair: pair[1])
    plt.barh(*zip(*ordered_pairs))
    return figure
def print_negative_coefficients_logistic_model(coef_dict):
    """Render the features with non-positive logistic coefficients.

    Args:
        coef_dict: mapping of feature name -> coefficient value.

    Side effects:
        Writes a markdown list of (feature, coefficient) pairs, sorted most
        negative first, to the Streamlit app.
    """
    # Keep features whose coefficient is <= 0, i.e. negatively correlated
    # with the probability of default.
    NegativeCoefficients = dict(
        filter(lambda x: x[1] <= 0.0, coef_dict.items())
    )
    NegativeCoefficientsSorted = sorted(
        NegativeCoefficients.items(), key=lambda x: x[1], reverse=False
    )
    text = (
        "\n\nFeatures the model found to be negatively correlated with probability of default are:"
        "\n{negative_features}:"
    )
    st.markdown(text.format(negative_features=NegativeCoefficientsSorted))
    # Removed leftover debug output (type(...) and dict_items dumps) that was
    # rendered to the UI and absent from the positive-coefficients counterpart.
def print_positive_coefficients_logistic_model(coef_dict):
    """Render the features with non-negative logistic coefficients.

    Args:
        coef_dict: mapping of feature name -> coefficient value.

    Side effects:
        Writes a markdown list of (feature, coefficient) pairs, sorted
        largest first, to the Streamlit app.
    """
    # Coefficients >= 0 indicate a positive correlation with default.
    positives = {name: coef for name, coef in coef_dict.items() if coef >= 0.0}
    ranked = sorted(positives.items(), key=lambda pair: pair[1], reverse=True)
    text = (
        "\n\nFeatures the model found to be positively correlated with probability of default are:"
        "\n{positive_features}:"
    )
    st.markdown(text.format(positive_features=ranked))
def plot_importance_gbt(clf_gbt_model, barxsize, barysize):
    """Plot weight-based feature importance for a gradient boosted tree model.

    Args:
        clf_gbt_model: fitted xgboost model.
        barxsize: figure width in inches.
        barysize: figure height in inches.

    Returns:
        The matplotlib Figure holding the importance plot (also writes a
        caption to the Streamlit app).
    """
    importance_ax = xgb.plot_importance(clf_gbt_model, importance_type="weight")
    importance_fig = importance_ax.figure
    st.write("Feature Importance Plot (Gradient Boosted Tree)")
    importance_fig.set_size_inches(barxsize, barysize)
    return importance_fig
def download_importance_gbt(fig1, barxsize, barysize):
    """Offer a button that saves the feature-importance figure as bar.png.

    Args:
        fig1: the matplotlib Figure returned by plot_importance_gbt.
        barxsize: desired width in inches.
        barysize: desired height in inches.

    Side effects:
        On click, writes bar.png next to the working directory.
    """
    if st.button(
        "Download Feature Importance Plot as png (Gradient Boosted Tree)"
    ):
        dpisize = max(barxsize, barysize)
        # Fix: resize BEFORE saving (the original resized after, so the size
        # never applied) and save fig1 itself rather than whatever figure
        # happens to be pyplot's current one.
        fig1.set_size_inches(barxsize, barysize)
        fig1.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight")
def plot_tree_gbt(treexsize, treeysize, clf_gbt_model):
    """Render the boosted model's decision tree and return its figure.

    Args:
        treexsize: figure width in inches.
        treeysize: figure height in inches.
        clf_gbt_model: fitted xgboost model.

    Returns:
        The matplotlib Figure containing the tree plot.
    """
    plot_tree(clf_gbt_model)
    # xgboost's plot_tree draws onto pyplot's current figure.
    tree_fig = plt.gcf()
    tree_fig.set_size_inches(treexsize, treeysize)
    return tree_fig
def download_tree_gbt(treexsize, treeysize):
    """Offer a button that saves the current tree plot as tree.png.

    Args:
        treexsize: width in inches (used to pick the output DPI).
        treeysize: height in inches (used to pick the output DPI).

    Side effects:
        On click, saves pyplot's current figure to tree.png.
    """
    if st.button("Download Decision Tree Plot as png (Gradient Boosted Tree)"):
        # Scale DPI with the larger dimension so big trees stay legible.
        resolution = max(treexsize, treeysize) * 96
        plt.savefig("tree.png", dpi=resolution, bbox_inches="tight")
def cross_validation_graph(cv, eval_metric, trees):
    """Plot the test-set metric from cross-validation results per iteration.

    Args:
        cv: cross-validation results frame; the third column is plotted
            (assumed to hold the test scores — TODO confirm against caller).
        eval_metric: metric name used in the title and y-axis label.
        trees: iteration count shown in the title.

    Returns:
        The matplotlib Figure with the score curve.
    """
    fig = plt.figure()
    plt.plot(cv[cv.columns[2]])
    plt.title(f"Test {eval_metric} Score Over {trees} Iterations")
    plt.xlabel("Iteration Number")
    plt.ylabel(f"Test {eval_metric} Score")
    return fig
def recall_accuracy_threshold_tradeoff_fig(
    widthsize,
    heightsize,
    threshold_list,
    thresh_def_recalls_list,
    thresh_nondef_recalls_list,
    thresh_accs_list,
):
    """Plot recall (default / non-default) and accuracy vs. threshold.

    Args:
        widthsize: figure width in inches.
        heightsize: figure height in inches.
        threshold_list: probability thresholds (x-axis).
        thresh_def_recalls_list: default-class recall at each threshold.
        thresh_nondef_recalls_list: non-default-class recall at each threshold.
        thresh_accs_list: model accuracy at each threshold.

    Returns:
        The matplotlib Figure with all three curves on [0, 1] x [0, 1].
    """
    fig = plt.figure(figsize=(widthsize, heightsize))
    curves = (
        (thresh_def_recalls_list, "Default Recall"),
        (thresh_nondef_recalls_list, "Non-Default Recall"),
        (thresh_accs_list, "Model Accuracy"),
    )
    for values, curve_label in curves:
        plt.plot(threshold_list, values, label=curve_label)
    plt.xlabel("Probability Threshold")
    plt.ylabel("Score")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.legend()
    plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold")
    plt.grid(False)
    return fig
def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelView]):
    """Plot ROC curves for each model against the random-guess diagonal.

    Args:
        y: true binary labels.
        model_views: ordered mapping of model name -> ModelView whose
            prediction_probability_df holds the predicted probabilities.

    Returns:
        The matplotlib Figure with one ROC curve per model.
    """
    colors = ["blue", "green"]
    fig = plt.figure()
    for color_idx, (model_name, model_view) in enumerate(model_views.items()):
        fpr, tpr, _thresholds = roc_curve(
            y, model_view.prediction_probability_df
        )
        # Fix: cycle through the palette so a third (or later) model no
        # longer raises IndexError on the two-entry colors list.
        plt.plot(
            fpr,
            tpr,
            color=colors[color_idx % len(colors)],
            label=f"{model_name}",
        )
    plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction")
    # Build a human-readable title: "A", "A and B", or "A, B and C".
    model_names = list(model_views.keys())
    if not model_names:
        model_name_str = "None"
    elif len(model_names) == 1:
        model_name_str = model_names[0]
    else:
        model_name_str = " and ".join(
            [", ".join(model_names[:-1]), model_names[-1]]
        )
    plt.title(f"ROC Chart for {model_name_str} on the Probability of Default")
    plt.xlabel("False Positive Rate (FP Rate)")
    plt.ylabel("True Positive Rate (TP Rate)")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return fig
def calibration_curve_report_commented_n(
    y, model_views: OrderedDict[str, ModelView], bins: int
):
    """Overlay calibration curves for each model with the ideal diagonal.

    Args:
        y: true binary labels.
        model_views: ordered mapping of model name -> ModelView whose
            prediction_probability_df holds the predicted probabilities.
        bins: number of bins passed to sklearn's calibration_curve.

    Returns:
        The matplotlib Figure with one calibration curve per model.
    """
    fig = plt.figure()
    for name, view in model_views.items():
        # NOTE(review): normalize=True is deprecated/removed in newer
        # scikit-learn releases — confirm the pinned version supports it.
        observed_freq, predicted_mean = calibration_curve(
            y,
            view.prediction_probability_df,
            n_bins=bins,
            normalize=True,
        )
        plt.plot(predicted_mean, observed_freq, "s-", label=f"{name}")
    # A perfectly calibrated model lies on the y = x guideline.
    plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
    plt.ylabel("Fraction of positives")
    plt.xlabel("Average Predicted Probability")
    plt.title("Calibration Curve")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return fig
def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins):
    """Histogram the default probabilities and mark the acceptance threshold.

    Args:
        probability_default: predicted probabilities of default.
        acceptancerate: quantile (0-1) at which to draw the threshold line.
        bins: number of histogram bins.

    Returns:
        Tuple of (figure, describe() summary of the probabilities,
        threshold value at the requested quantile).
    """
    # Summary statistics of the probability distribution.
    probability_stat_distribution = probability_default.describe()
    # The threshold is the probability at the requested acceptance quantile.
    acc_rate_thresh = np.quantile(probability_default, acceptancerate)
    fig = plt.figure()
    plt.hist(
        probability_default,
        color="blue",
        bins=bins,
        histtype="bar",
        ec="white",
    )
    # Add a reference line to the plot for the threshold
    plt.axvline(x=acc_rate_thresh, color="red")
    # Fix: corrected user-facing typo "Thershold" -> "Threshold".
    plt.title("Acceptance Rate Threshold")
    return (
        fig,
        probability_stat_distribution,
        acc_rate_thresh,
    )
def streamlit_2columns_metrics_pct_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the share of rows equal to 1 (left) and 0 (right) as percentages.

    Args:
        column1name_label: metric label for the value-1 share.
        column2name_label: metric label for the value-0 share.
        df: single-column frame of 0/1 values — TODO confirm against callers.

    Side effects:
        Renders two st.metric widgets in side-by-side columns.
    """
    (
        column1name,
        column2name,
    ) = st.columns(2)
    # Compute counts once; default missing classes to 0 so an all-0 or
    # all-1 frame no longer crashes with "None / int" in the division.
    counts = df.value_counts()
    total = df.shape[0]
    with column1name:
        st.metric(
            label=column1name_label,
            value="{:.0%}".format(counts.get(1, 0) / total),
            delta=None,
            delta_color="normal",
        )
    with column2name:
        st.metric(
            label=column2name_label,
            value="{:.0%}".format(counts.get(0, 0) / total),
            delta=None,
            delta_color="normal",
        )
def streamlit_2columns_metrics_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show counts of value 1 (left) and value 0 (right) as two metrics.

    Args:
        column1name_label: metric label for the value-1 count.
        column2name_label: metric label for the value-0 count.
        df: frame whose value_counts() is keyed by 0/1.

    Side effects:
        Renders two st.metric widgets in side-by-side columns.
    """
    left_col, right_col = st.columns(2)
    with left_col:
        st.metric(
            label=column1name_label,
            value=df.value_counts().get(1),
            delta=None,
            delta_color="normal",
        )
    with right_col:
        st.metric(
            label=column2name_label,
            value=df.value_counts().get(0),
            delta=None,
            delta_color="normal",
        )
def streamlit_2columns_metrics_df_shape(df: pd.DataFrame):
    """Display the dataframe's row and column counts side by side.

    Args:
        df: any dataframe; only its shape is read.

    Side effects:
        Renders two st.metric widgets ("Rows" and "Columns").
    """
    rows_col, cols_col = st.columns(2)
    n_rows, n_cols = df.shape
    with rows_col:
        st.metric(
            label="Rows",
            value=n_rows,
            delta=None,
            delta_color="normal",
        )
    with cols_col:
        st.metric(
            label="Columns",
            value=n_cols,
            delta=None,
            delta_color="normal",
        )
def streamlit_2columns_metrics_pct_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show the series entries keyed 1 and 0 as percentages of the total.

    Args:
        column1name_label: metric label for the key-1 share.
        column2name_label: metric label for the key-0 share.
        series: counts indexed by 0/1 — presumably a value_counts() result;
            verify against callers.

    Side effects:
        Renders two st.metric widgets in side-by-side columns.
    """
    (
        column1name,
        column2name,
    ) = st.columns(2)
    # Default a missing key to 0 so a series lacking the 0 or 1 index no
    # longer crashes with "None / number" in the division.
    total = series.sum()
    with column1name:
        st.metric(
            label=column1name_label,
            value="{:.0%}".format(series.get(1, 0) / total),
            delta=None,
            delta_color="normal",
        )
    with column2name:
        st.metric(
            label=column2name_label,
            value="{:.0%}".format(series.get(0, 0) / total),
            delta=None,
            delta_color="normal",
        )
def streamlit_2columns_metrics_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show the series entries keyed 1 (left) and 0 (right) as two metrics.

    Args:
        column1name_label: metric label for the key-1 entry.
        column2name_label: metric label for the key-0 entry.
        series: series indexed by 0/1.

    Side effects:
        Renders two st.metric widgets in side-by-side columns.
    """
    left_col, right_col = st.columns(2)
    for layout_col, metric_label, key in (
        (left_col, column1name_label, 1),
        (right_col, column2name_label, 0),
    ):
        with layout_col:
            st.metric(
                label=metric_label,
                value=series.get(key),
                delta=None,
                delta_color="normal",
            )
def streamlit_chart_setting_height_width(
    title: str,
    default_widthvalue: int,
    default_heightvalue: int,
    widthkey: str,
    heightkey: str,
):
    """Collect chart width/height (inches) from two number inputs.

    Args:
        title: label of the expander wrapping the inputs.
        default_widthvalue: initial width value.
        default_heightvalue: initial height value.
        widthkey: Streamlit session-state key for the width input.
        heightkey: Streamlit session-state key for the height input.

    Returns:
        Tuple of (width_size, height_size) as entered by the user.
    """
    with st.expander(title):
        width_col, height_col = st.columns(2)
        with width_col:
            width_size = st.number_input(
                label="Width in inches:",
                value=default_widthvalue,
                key=widthkey,
            )
        with height_col:
            height_size = st.number_input(
                label="Height in inches:",
                value=default_heightvalue,
                key=heightkey,
            )
    return width_size, height_size