import pickle
import time

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
from skops import hub_utils
from sklearn.ensemble import IsolationForest

# Data preparation
n_samples, n_outliers = 120, 40
rng = np.random.RandomState(0)
covariance = np.array([[0.5, -0.1], [0.7, 0.4]])
cluster_1 = 0.4 * rng.randn(n_samples, 2) @ covariance + np.array([2, 2])  # general deformed cluster
cluster_2 = 0.3 * rng.randn(n_samples, 2) + np.array([-2, -2])  # spherical cluster
outliers = rng.uniform(low=-4, high=4, size=(n_outliers, 2))
X = np.concatenate([cluster_1, cluster_2, outliers])  # 120 + 120 + 40 = 280 points in 2D
y = np.concatenate(
    [np.ones((2 * n_samples), dtype=int), -np.ones((n_outliers), dtype=int)]
)


def load_hf_model_hub():
    """Download the pretrained model and associated files from the Hugging Face
    model repository and unpickle the estimator."""
    repo_id = "sklearn-docs/anomaly-detection"
    download_repo = "downloaded-model"
    hub_utils.download(repo_id=repo_id, dst=download_repo)
    time.sleep(2)
    with open("./downloaded-model/isolation_forest.pkl", "rb") as f:
        loaded_model = pickle.load(f)
    return loaded_model


def visualize_input_data():
    """Visualize the input data as a scatter plot."""
    fig = plt.figure(1, facecolor="w", figsize=(5, 5))
    scatter = plt.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
    handles, _ = scatter.legend_elements()
    plt.axis("square")
    plt.legend(handles=handles, labels=["outliers", "inliers"], title="true class")
    plt.title("Gaussian inliers with \nuniformly distributed outliers")
    return fig


title = "An example using IsolationForest for anomaly detection"

with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(
        """
        The isolation forest is an ensemble of isolation trees: it isolates data
        points using recursive random partitioning. Outliers require fewer
        splits to isolate than inliers, so the number of splits acts as an
        anomaly score. This educational demo uses the toy dataset from the
        scikit-learn Isolation Forest example.
        """
    )
    gr.Markdown(
        "You can see the associated scikit-learn example "
        "[here](https://scikit-learn.org/stable/auto_examples/ensemble/plot_isolation_forest.html#sphx-glr-auto-examples-ensemble-plot-isolation-forest-py)."
    )

    loaded_model = load_hf_model_hub()

    with gr.Tab("Visualize Input dataset"):
        btn = gr.Button(value="Visualize input dataset")
        with gr.Row():
            btn.click(
                visualize_input_data,
                outputs=gr.Plot(label="Visualizing input dataset"),
            )
        gr.Markdown(
            """
            ## Data Generation

            We generate two clusters from a standard normal distribution: one
            spherical and the other slightly deformed. For consistency, inliers
            are assigned the ground-truth label 1 and outliers the label -1.
            The plot visualizes the clusters of the input dataset.
            """
        )
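    # The two tabs below display pre-rendered PNGs shipped with the Hub repo.
    # As a minimal sketch (hypothetical helpers, not wired into the UI), the
    # figures could be reproduced locally from the loaded model with
    # sklearn.inspection.DecisionBoundaryDisplay:
    def plot_decision_boundary(model=loaded_model):
        from sklearn.inspection import DecisionBoundaryDisplay

        # Background shows the binary outlier/inlier prediction per region.
        disp = DecisionBoundaryDisplay.from_estimator(
            model, X, response_method="predict", alpha=0.5
        )
        disp.ax_.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
        disp.ax_.set_title("Binary decision boundary \nof IsolationForest")
        plt.axis("square")
        return disp.figure_

    def plot_path_length(model=loaded_model):
        from sklearn.inspection import DecisionBoundaryDisplay

        # Background shows the averaged path length (measure of normality).
        disp = DecisionBoundaryDisplay.from_estimator(
            model, X, response_method="decision_function", alpha=0.5
        )
        disp.ax_.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
        disp.ax_.set_title("Path length decision boundary \nof IsolationForest")
        plt.axis("square")
        return disp.figure_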
""") with gr.Tab("Plot Decision Boundary"): # btn_decision = gr.Button(value="Plot decision boundary") # btn_decision.click(plot_decision_boundary, outputs= gr.Plot(label='Plot decision boundary') ) with gr.Row(): image_decision = gr.Image('./downloaded-model/decision_boundary.png') gr.Markdown( """ ## Plot the Discrete Decision Boundary We plot the discrete decision boundary. The background colour represents whether a sample in that given area is predicted to be an outlier or not. The scatter plot displays the true labels """) with gr.Tab("Plot Path"): with gr.Row(): image_path = gr.Image('./downloaded-model/plot_path.png') gr.Markdown( """ ## Plot the path length of the decision boundary By setting the `response_method="decision_function"`, the background of the `DecisionBoundaryDisplay` represents the measure of the normality of an observation. Normality of Observation = Path Length/Number of Forests of Random Trees The RHS of the above equation is given by the number of splits required to isolate a given sample Such score is given by the path length averaged over a forest of random trees, which itself is given by the depth of the leaf (or equivalently the number of splits) required to isolate a given sample. When a forest of random trees collectively produces short path lengths for isolating some particular samples, they are more likely to have anomalies, and the measure of normality is close to 0. Similarly, large paths correspond to values close to 1 and are more likely to be inliers. """) demo.launch()