Nahrawy's picture
Update app.py
3e39e30
raw
history blame
5.41 kB
import time
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils import shuffle
def load_mnist(classes, n_samples):
"""Load MNIST, select two classes, shuffle and return only n_samples."""
# Load data from http://openml.org/d/554
mnist = fetch_openml("mnist_784", version=1, as_frame=False, parser="pandas")
# take only two classes for binary classification
mask = np.in1d(mnist.target, classes)
X, y = shuffle(mnist.data[mask], mnist.target[mask], random_state=42)
X, y = X[:n_samples], y[:n_samples]
return X, y
@ignore_warnings(category=ConvergenceWarning)
def fit_and_score(estimator, max_iter, X_train, X_test, y_train, y_test):
"""Fit the estimator on the train set and score it on both sets"""
estimator.set_params(max_iter=max_iter)
estimator.set_params(random_state=0)
start = time.time()
estimator.fit(X_train, y_train)
fit_time = time.time() - start
n_iter = estimator.n_iter_
train_score = estimator.score(X_train, y_train)
test_score = estimator.score(X_test, y_test)
return fit_time, n_iter, train_score, test_score
def plot(classes, max_iterations, num_samples, n_iter_no_change, validation_fraction, tol):
if len(classes) <2:
raise gr.Error(f'Invalid number of classes (Numbers to be included in training)')
max_iterations = int(max_iterations)
num_samples = int(num_samples)
n_iter_no_change = int(n_iter_no_change)
validation_fraction = float(validation_fraction)
tol = float(tol)
# Define the estimators to compare
estimator_dict = {
"No stopping criterion": linear_model.SGDClassifier(n_iter_no_change=n_iter_no_change),
"Training loss": linear_model.SGDClassifier(
early_stopping=False, n_iter_no_change=n_iter_no_change, tol=0.1
),
"Validation score": linear_model.SGDClassifier(
early_stopping=True, n_iter_no_change=n_iter_no_change, tol=tol, validation_fraction=validation_fraction
),
}
# Load the dataset
X, y = load_mnist(classes, n_samples=num_samples)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
results = []
for estimator_name, estimator in estimator_dict.items():
for max_iter in range(1, max_iterations):
fit_time, n_iter, train_score, test_score = fit_and_score(
estimator, max_iter, X_train, X_test, y_train, y_test
)
results.append(
(estimator_name, max_iter, fit_time, n_iter, train_score, test_score)
)
# Transform the results in a pandas dataframe for easy plotting
columns = [
"Stopping criterion",
"max_iter",
"Fit time (sec)",
"n_iter_",
"Train score",
"Test score",
]
results_df = pd.DataFrame(results, columns=columns)
# Define what to plot
lines = "Stopping criterion"
x_axis = "max_iter"
styles = ["-.", "--", "-"]
# First plot: train and test scores
fig1, axes1 = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 4))
for ax, y_axis in zip(axes1, ["Train score", "Test score"]):
for style, (criterion, group_df) in zip(styles, results_df.groupby(lines)):
group_df.plot(x=x_axis, y=y_axis, label=criterion, ax=ax, style=style)
ax.set_title(y_axis)
ax.legend(title=lines)
fig1.tight_layout()
# Second plot: n_iter and fit time
fig2, axes2 = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
for ax, y_axis in zip(axes2, ["n_iter_", "Fit time (sec)"]):
for style, (criterion, group_df) in zip(styles, results_df.groupby(lines)):
group_df.plot(x=x_axis, y=y_axis, label=criterion, ax=ax, style=style)
ax.set_title(y_axis)
ax.legend(title=lines)
fig2.tight_layout()
return fig1, fig2
with gr.Blocks() as demo:
gr.Markdown(info)
with gr.Row():
with gr.Column():
classes = gr.CheckboxGroup(["0", "1", "2","3","4","5","6","7","8","9"], value=['0','8'],label="Classes", info="Numbers to include in the training, for fast and stable training please choose 2 classes only")
max_iterations = gr.Slider(label="Max Number of Iterations", value="50", minimum=5, maximum=50, step=1, info="Max Number of iterations to run SGD")
num_samples = gr.Slider(label="Number of Samples", value="10000", minimum=1000, maximum=70000, step=100, info="Number of samples to include in the training")
n_iter_no_change = gr.Slider(label="Number of Iterations with No Change", value="3", minimum=1, maximum=10, step=1, info="Maximum number of iterations with no score improvement by at leat tol, before stopping")
validation_fraction = gr.Slider(label="Validation Fraction", value="0.2", minimum=0.05, maximum=0.9, step=0.01, info="Fraction of the training data to be used for validation")
tol = gr.Textbox(label='Stopping Criterion', value="0.0001",info="The minimum improvement of score to be considered")
out1 = gr.Plot()
out2 = gr.Plot()
btn = gr.Button("Run")
btn.click(fn=plot, inputs=[classes, max_iterations, num_samples, n_iter_no_change, validation_fraction, tol], outputs=[out1, out2])
demo.launch()