"""Gradio demo: K-Means initialization strategies on the handwritten digits data.

Benchmarks three KMeans initializations (k-means++, random, PCA-based), shows
their clustering metrics in a table, and plots the k-means++ decision regions
on a 2-D PCA projection of the data.
"""
from functools import lru_cache
from time import time

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import load_dataset
from huggingface_hub import login
from matplotlib.figure import Figure
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py

# Labels of the numeric values returned by ``bench_k_means`` (after the
# strategy name), in the order they are produced.
METRIC_NAMES = ["time", "inertia", "homo", "compl", "v-meas", "ARI", "AMI", "silhouette"]

# Display names of the benchmarked initialization strategies, in table order.
STRATEGY_NAMES = ["k-means++", "random", "PCA-based"]


def display_plot(data, n_digits):
    """Plot K-Means decision regions on a 2-D PCA projection of ``data``.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        The data to project and cluster.
    n_digits : int or float
        Number of clusters. Cast to ``int`` because the value may arrive
        straight from a UI slider.

    Returns
    -------
    matplotlib.figure.Figure
        Figure showing the cluster regions, the projected samples (black dots)
        and the centroids (white crosses).
    """
    n_clusters = int(n_digits)

    reduced_data = PCA(n_components=2).fit_transform(data)
    kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=4)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = 0.02

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh and shape them like the mesh.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Build a standalone Figure instead of going through pyplot: pyplot keeps
    # every figure it creates alive in a global registry and draws on a global
    # "current figure", which leaks memory and is unsafe when several requests
    # are served by the same process.
    fig = Figure()
    ax = fig.subplots()
    ax.imshow(
        Z,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap=plt.cm.Paired,
        aspect="auto",
        origin="lower",
    )
    ax.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)

    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    ax.scatter(
        centroids[:, 0],
        centroids[:, 1],
        marker="x",
        s=169,
        linewidths=3,
        color="w",
        zorder=10,
    )
    ax.set_title(
        "K-means clustering on the digits dataset (PCA-reduced data)\n"
        "Centroids are marked with white cross"
    )
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    return fig


def bench_k_means(kmeans, name, data, labels):
    """Benchmark to evaluate the KMeans initialization methods.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance with the initialization
        already set.
    name : str
        Name given to the strategy. It will be used to show the results in a
        table.
    data : ndarray of shape (n_samples, n_features)
        The data to cluster.
    labels : ndarray of shape (n_samples,)
        The labels used to compute the clustering metrics which requires some
        supervision.

    Returns
    -------
    list
        ``[name, fit_time, inertia, homogeneity, completeness, v_measure,
        adjusted_rand, adjusted_mutual_info, silhouette]``.
    """
    t0 = time()
    estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
    fit_time = time() - t0
    results = [name, fit_time, estimator[-1].inertia_]

    # Define the metrics which require only the true labels and estimator
    # labels
    clustering_metrics = [
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    ]
    results += [m(labels, estimator[-1].labels_) for m in clustering_metrics]

    # The silhouette score requires the full dataset. It is computed on a
    # sample of 300 points and no random_state is passed.
    results += [
        metrics.silhouette_score(
            data,
            estimator[-1].labels_,
            metric="euclidean",
            sample_size=300,
        )
    ]
    return results


title = "A demo of K-Means clustering on the handwritten digits data"


@lru_cache(maxsize=1)
def _load_digits():
    """Load the digits dataset once and return ``(data, labels)``.

    The first 64 columns of the train split are used as features and column 64
    as the ground-truth label. The result is cached so the dataset is not
    reloaded on every button click; callers must not mutate it in place.
    """
    dataset = load_dataset("sklearn-docs/digits", header=None)
    # convert dataset to pandas
    frame = dataset["train"].to_pandas()
    return frame.iloc[:, :64], frame.iloc[:, 64]


def do_submit(kmeans_n_digit, random_n_digit, pca_n_digit):
    """Run the three benchmarks and build the outputs for the UI.

    Parameters
    ----------
    kmeans_n_digit, random_n_digit, pca_n_digit : int or float
        Number of clusters for the k-means++, random and PCA-based
        initializations respectively (slider values; cast to ``int``).

    Returns
    -------
    tuple
        ``(figure, table)`` where ``figure`` is the decision-region plot for
        ``kmeans_n_digit`` clusters and ``table`` is a DataFrame with one row
        per metric and the columns ``metrics``, ``k-means++``, ``random`` and
        ``PCA-based``.
    """
    data, labels = _load_digits()
    n_kmeans = int(kmeans_n_digit)
    n_random = int(random_n_digit)
    n_pca = int(pca_n_digit)

    # The PCA components serve as deterministic initial centers, so a single
    # initialization (n_init=1) is enough for that strategy.
    pca = PCA(n_components=n_pca).fit(data)
    strategies = [
        ("k-means++", KMeans(init="k-means++", n_clusters=n_kmeans, n_init=4, random_state=0)),
        ("random", KMeans(init="random", n_clusters=n_random, n_init=4, random_state=0)),
        ("PCA-based", KMeans(init=pca.components_, n_clusters=n_pca, n_init=1)),
    ]
    rows = [
        bench_k_means(kmeans=estimator, name=name, data=data, labels=labels)
        for name, estimator in strategies
    ]

    # One row per strategy -> round -> transpose so each metric becomes a row.
    table = pd.DataFrame(rows, columns=["init"] + METRIC_NAMES).set_index("init")
    table = table.astype(float).round(3).T.reset_index()
    table.columns = ["metrics"] + STRATEGY_NAMES

    return display_plot(data, n_kmeans), table


# Theme from - https://huggingface.co/spaces/trl-lib/stack-llama/blob/main/app.py
theme = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    radius_size=gr.themes.sizes.radius_sm,
    font=[gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
)

with gr.Blocks(title=title, theme=theme) as demo:
    gr.Markdown(f"## {title}")
    gr.Markdown("This demo is based on this [scikit-learn example](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py)")
    gr.Markdown("In this example we compare the various initialization strategies for K-means in terms of runtime and quality of the results.")
    gr.Markdown("As the ground truth is known here, we also apply different cluster quality metrics to judge the goodness of fit of the cluster labels to the ground truth.")
    gr.Markdown(
        "Cluster quality metrics evaluated (see [Clustering performance evaluation](https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation) "
        "for definitions and discussions of the metrics):"
    )
    gr.Markdown("---")
    gr.Markdown(
        " We will be utilizing [digits](https://huggingface.co/datasets/sklearn-docs/digits) dataset. This dataset contains handwritten digits from 0 to 9. "
        "In the context of clustering, one would like to group images such that the handwritten digits on the image are the same."
    )

    with gr.Row():
        # Both columns share the row equally; ``scale`` is given as an integer.
        with gr.Column(scale=1):
            kmeans_n_digit = gr.Slider(minimum=2, maximum=10, label="KMeans n_digits", info="n_digits is number of handwritten digits", step=1, value=10)
            random_n_digit = gr.Slider(minimum=2, maximum=10, label="Random n_digits", step=1, value=10)
            pca_n_digit = gr.Slider(minimum=2, maximum=10, label="PCA n_digits", step=1, value=10)

            plt_out = gr.Plot()

        with gr.Column(scale=1):
            # Placeholder with the same rows and columns as the real output.
            sample_df = pd.DataFrame({"metrics": METRIC_NAMES, **dict.fromkeys(STRATEGY_NAMES, 0.0)})
            output = gr.Dataframe(sample_df, label="Clustering Metrics")

    with gr.Row():
        sub_btn = gr.Button("Submit")
        sub_btn.click(fn=do_submit, inputs=[kmeans_n_digit, random_n_digit, pca_n_digit], outputs=[plt_out, output])

demo.launch()