# Spaces: Runtime error
# (page status text captured together with the source; not part of the program)
import sys | |
tabpfn_path = 'TabPFN' | |
sys.path.insert(0, tabpfn_path) # our submodule of the TabPFN repo (at 045c8400203ebd062346970b4f2c0ccda5a40618) | |
from TabPFN.scripts.transformer_prediction_interface import TabPFNClassifier | |
from decision_boundary import DecisionBoundaryDisplay | |
import numpy as np | |
from pathlib import Path | |
import pandas as pd | |
import torch | |
import gradio as gr | |
import openml | |
import os | |
import matplotlib.pyplot as plt | |
from matplotlib.colors import ListedColormap | |
# Use the first CUDA device when torch reports one, otherwise fall back to CPU.
if torch.cuda.is_available():
    default_device = "cuda:0"
else:
    default_device = "cpu:0"

# Module-level classifier instance used by `compute`; built once at import
# time with 4 ensemble configurations.
classifier = TabPFNClassifier(
    base_path=tabpfn_path,
    device=default_device,
    N_ensemble_configurations=4,
)
def compute(df_table):
    """Predict the blank cells of one column and plot a 2-D decision boundary.

    Completely empty rows are dropped. The single column that contains
    empty-string cells is treated as the target: rows where it is filled are
    used to fit the module-level ``classifier``, rows where it is blank are
    predicted.

    Args:
        df_table: pandas DataFrame holding features and the target column.
            Feature cells must be convertible to ``float32``.

    Returns:
        A 3-tuple ``(message, predictions, figure)``:

        * ``message`` - Markdown text; a warning when validation fails.
        * ``predictions`` - DataFrame with the predicted rows only, the target
          cell formatted as ``"<label> (p=<probability>)"``; ``None`` on
          validation failure.
        * ``figure`` - matplotlib figure showing the first two features and
          the first class versus the rest; ``None`` on validation failure or
          when fewer than two feature columns exist.
    """
    headers = df_table.columns
    table = df_table.to_numpy()

    # Drop rows whose cells all stringify to ''. `otypes` is set explicitly
    # because np.vectorize cannot infer the output type from an empty table.
    cell_length = np.vectorize(lambda cell: len(str(cell)), otypes=[int])
    table = table[cell_length(table).sum(1) != 0]

    # Compare as objects so the test stays element-wise for numeric tables.
    empty_inds = np.where(table.astype(object) == '')

    if table.shape[0] > 1024:
        return "⚠️ **ERROR: TabPFN is not made for datasets with a trainingsize > 1024.**", None, None
    if table.shape[1] > 100:
        return "⚠️ **ERROR: TabPFN is not made for datasets with a feature size > 100.**", None, None
    if not len(empty_inds[0]):
        return "⚠️ **ERROR: Please leave at least one field blank for prediction.**", None, None
    if not np.all(empty_inds[1][0] == empty_inds[1]):
        return "⚠️ **Please only leave fields of one column blank for prediction.**", None, None

    y_column = empty_inds[1][0]
    eval_lines = empty_inds[0]
    train_table = np.delete(table, eval_lines, axis=0)
    eval_table = table[eval_lines]

    # Without any labelled row there is nothing to fit on.
    if train_table.shape[0] == 0:
        return "⚠️ **ERROR: Please keep the target value of at least one row for training.**", None, None

    try:
        x_train = torch.tensor(np.delete(train_table, y_column, axis=1).astype(np.float32))
        x_eval = torch.tensor(np.delete(eval_table, y_column, axis=1).astype(np.float32))
        y_train = np.array(train_table[:, y_column].tolist())
    except ValueError:
        return "⚠️ **Please only add numbers (to the inputs) or leave fields empty.**", None, None

    classifier.fit(x_train, y_train)
    y_eval, p_eval = classifier.predict(x_eval, return_winning_probability=True)

    # Result table: only the predicted rows, with the original column labels.
    out_table = pd.DataFrame(table.copy().astype(str))
    out_table.iloc[eval_lines, y_column] = [f"{y_e} (p={p_e:.2f})" for y_e, p_e in zip(y_eval, p_eval)]
    out_table = out_table.iloc[eval_lines, :]
    out_table.columns = headers

    # The plot needs two feature columns; otherwise return the table only.
    if x_train.shape[1] < 2:
        return (
            "The tabular results below are based on the full dataset. "
            "No plot is shown because it requires at least two features.\n"
            "This demo is running on a CPU only and with 4 ensemble members (32 in the paper).",
            out_table,
            None,
        )

    # PLOTTING
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    cm_bright = ListedColormap(["#FF0000", "#0000FF"])

    # Reduce to a binary problem: first entry of classifier.classes_ vs. the rest.
    is_first_class = y_train == classifier.classes_[0]

    # Plot the training points
    ax.scatter(x_train[:, 0], x_train[:, 1], c=is_first_class, cmap=cm_bright)

    # Refit on the two plotted features only so the boundary matches the axes.
    classifier.fit(x_train[:, 0:2], is_first_class)
    DecisionBoundaryDisplay.from_estimator(
        classifier, x_train[:, 0:2], alpha=0.6, ax=ax, eps=2.0, grid_resolution=25, response_method="predict_proba"
    )

    # Label the axes with the feature columns, i.e. skipping the target column.
    feature_headers = [name for position, name in enumerate(headers) if position != y_column]
    ax.set_xlabel(feature_headers[0])
    ax.set_ylabel(feature_headers[1])

    # Remove the figure from pyplot's global registry so figures do not pile
    # up across requests; the returned Figure object can still be rendered.
    plt.close(fig)

    return "The plot visualizes a predictor based on only two features and for two classes. The tabular results below are based on the full dataset.\nThis demo is running on a CPU only and with 4 ensemble members (32 in the paper).", out_table, fig
def upload_file(file, remove_entries=10):
    """Load a dataset file into a DataFrame and blank the first target cells.

    Args:
        file: Object exposing the file path as ``.name``, or a ``str`` /
            ``os.PathLike`` path. ``None`` is accepted and leaves the output
            component unchanged.
            NOTE(review): assumes the file component emits ``None`` when the
            upload is cleared - confirm for the Gradio version in use.
        remove_entries: Number of leading rows whose last-column value is
            replaced by ``''``.

    Returns:
        DataFrame with integer column labels ``0..n-1`` and the first
        ``remove_entries`` cells of the last column set to ``''``; or
        ``gr.update()`` when ``file`` is ``None``.

    Raises:
        ValueError: If the file name does not end in ``.arff``, ``.csv`` or
            ``.data``.
    """
    if file is None:
        return gr.update()

    # Keep the full path for path-like inputs; `.name` on a Path would drop
    # the directory part.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    lowered = path.lower()
    if lowered.endswith('.arff'):
        dataset = openml.datasets.OpenMLDataset('t', 'test', data_file=path)
        X_, _, _, attribute_names_ = dataset.get_data(
            dataset_format="array"
        )
        df = pd.DataFrame(X_, columns=attribute_names_)
    elif lowered.endswith(('.csv', '.data')):
        # NOTE(review): header='infer' consumes the first line as column
        # names, while the upload label in this file asks for CSVs without a
        # header - confirm which one is intended.
        df = pd.read_csv(path, header='infer')
    else:
        raise ValueError(
            f"Unsupported file type: {path!r}. Expected a .csv, .data or .arff file."
        )

    # Column names are replaced by positional integer labels.
    df.columns = np.arange(len(df.columns))

    # Cast the target column to object first so that writing '' into a
    # numeric column does not rely on implicit dtype upcasting.
    target = df.columns[-1]
    df[target] = df[target].astype(object)
    df.iloc[0:remove_entries, -1] = ''
    return df
def update_table(table):
    """Drop fully empty rows and blank the target cell of rows with a blank.

    The column of the first empty-string cell found is taken as the target
    column; in every row that contains an empty-string cell, that column is
    set to ``''``. If no empty-string cell exists, the row-filtered table is
    returned as is.
    """
    cell_length = np.vectorize(lambda cell: len(str(cell)))
    keep_rows = cell_length(table).sum(1) != 0
    table = table[keep_rows]

    blank_positions = np.where(table == '')
    blank_rows = blank_positions[0]
    if len(blank_rows) == 0:
        return table

    target_column = blank_positions[1][0]
    table.iloc[blank_rows, target_column] = ''
    return table
# Introductory text shown at the top of the demo page.
INTRO_MARKDOWN = """This demo allows you to experiment with the **TabPFN** model for tabular data.
If you remove values in the target column, TabPFN will make predictions on them after clicking on the Button. The first 10 target values were already removed for this example dataset, so TabPFN will predict the first 10 classes.
Please, provide everything but the targets as numeric values and only remove values in one column (the target column).
"""

# NOTE(review): the nesting below is reconstructed from statement order; the
# captured source had lost its indentation.
with gr.Blocks() as demo:
    # The intro is created inside the Blocks context so it becomes part of
    # the rendered layout.
    gr.Markdown(INTRO_MARKDOWN)
    with gr.Row():
        # Left column: editable input table, file upload and example files.
        with gr.Column():
            inp_table = gr.DataFrame(
                type='pandas',
                value=upload_file(Path('iris.csv'), remove_entries=10),
                headers=[''] * 5,
            )
            inp_file = gr.File(
                label='Drop either a .csv (without header, only numeric values for all but the labels) or a .arff file.')
            examples = gr.Examples(examples=['iris.csv', 'balance-scale.arff'],
                                   inputs=[inp_file],
                                   outputs=[inp_table],
                                   fn=upload_file,
                                   cache_examples=True)
            # Disabled hook for `update_table`:
            #inp_table.change(fn=update_table, inputs=inp_table, outputs=inp_table)
        # Right column: trigger button and the three outputs of `compute`.
        with gr.Column():
            btn = gr.Button("Calculate Predictions")
            out_text = gr.Markdown()
            # NOTE(review): the `type` argument may not be accepted by every
            # Gradio release - confirm against the pinned version.
            out_plot = gr.Plot(type="Matplotlib")
            out_table = gr.DataFrame()

    # Output order matches the (message, table, figure) tuple `compute` returns.
    btn.click(fn=compute, inputs=inp_table, outputs=[out_text, out_table, out_plot])
    # Reload the input table whenever the uploaded file changes.
    inp_file.change(fn=upload_file, inputs=inp_file, outputs=inp_table)

demo.launch()