Spaces:
Build error
Build error
yesssssssss
commited on
Commit
•
5faa10b
1
Parent(s):
542daa5
init
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +2 -0
- .gitmodules +0 -0
- README.md +5 -6
- TabPFN/PrepareDatasets.ipynb +373 -0
- TabPFN/README.md +23 -0
- TabPFN/SyntheticGPAblation.ipynb +392 -0
- TabPFN/TabPFNPredictionOnly.ipynb +253 -0
- TabPFN/TabularEvaluationVisualization.ipynb +0 -0
- TabPFN/TrainingTuningAndPrediction.ipynb +0 -0
- TabPFN/__pycache__/encoders.cpython-39.pyc +0 -0
- TabPFN/__pycache__/layer.cpython-39.pyc +0 -0
- TabPFN/__pycache__/model_builder.cpython-39.pyc +0 -0
- TabPFN/__pycache__/notebook_utils.cpython-39.pyc +0 -0
- TabPFN/__pycache__/positional_encodings.cpython-39.pyc +0 -0
- TabPFN/__pycache__/train.cpython-39.pyc +0 -0
- TabPFN/__pycache__/transformer.cpython-39.pyc +0 -0
- TabPFN/__pycache__/utils.cpython-39.pyc +0 -0
- TabPFN/datasets/__init__.py +149 -0
- TabPFN/datasets/__pycache__/__init__.cpython-39.pyc +0 -0
- TabPFN/datasets/utils.py +8 -0
- TabPFN/decoders.py +30 -0
- TabPFN/differentiable_pfn_evaluation.py +345 -0
- TabPFN/encoders.py +225 -0
- TabPFN/initializers.py +9 -0
- TabPFN/layer.py +125 -0
- TabPFN/losses.py +41 -0
- TabPFN/model_builder.py +273 -0
- TabPFN/models_diff/gp_ablation_model.cpkt +3 -0
- TabPFN/models_diff/prior_diff_real_checkpoint_n_8x_lr0.0003_epoch_49.cpkt +3 -0
- TabPFN/notebook_utils.py +32 -0
- TabPFN/positional_encodings.py +70 -0
- TabPFN/prior_tuning_result.pkl +3 -0
- TabPFN/priors/__init__.py +4 -0
- TabPFN/priors/__pycache__/__init__.cpython-39.pyc +0 -0
- TabPFN/priors/__pycache__/differentiable_prior.cpython-39.pyc +0 -0
- TabPFN/priors/__pycache__/fast_gp.cpython-39.pyc +0 -0
- TabPFN/priors/__pycache__/flexible_categorical.cpython-39.pyc +0 -0
- TabPFN/priors/__pycache__/mlp.cpython-39.pyc +0 -0
- TabPFN/priors/__pycache__/prior.cpython-39.pyc +0 -0
- TabPFN/priors/__pycache__/prior_bag.cpython-39.pyc +0 -0
- TabPFN/priors/__pycache__/utils.cpython-39.pyc +0 -0
- TabPFN/priors/differentiable_prior.py +293 -0
- TabPFN/priors/fast_gp.py +144 -0
- TabPFN/priors/flexible_categorical.py +240 -0
- TabPFN/priors/mlp.py +173 -0
- TabPFN/priors/prior.py +12 -0
- TabPFN/priors/prior_bag.py +32 -0
- TabPFN/priors/utils.py +163 -0
- TabPFN/requirements.txt +15 -0
- TabPFN/scripts/__pycache__/tabular_baselines.cpython-39.pyc +0 -0
.gitattributes
CHANGED
@@ -29,3 +29,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
29 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
30 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
31 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
29 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
30 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
31 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.cpkt filter=lfs diff=lfs merge=lfs -text
|
.gitmodules
ADDED
File without changes
|
README.md
CHANGED
@@ -1,13 +1,12 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.1.1
|
8 |
app_file: app.py
|
9 |
-
pinned:
|
10 |
-
license: apache-2.0
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: TabPFNEvaluationDemo
|
3 |
+
emoji: 🏢
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: red
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.1.1
|
8 |
app_file: app.py
|
9 |
+
pinned: true
|
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
TabPFN/PrepareDatasets.ipynb
ADDED
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import numpy as np\n",
|
10 |
+
"\n",
|
11 |
+
"import openml\n",
|
12 |
+
"import pandas as pd"
|
13 |
+
]
|
14 |
+
},
|
15 |
+
{
|
16 |
+
"cell_type": "code",
|
17 |
+
"execution_count": 2,
|
18 |
+
"metadata": {},
|
19 |
+
"outputs": [],
|
20 |
+
"source": [
|
21 |
+
"from tqdm import tqdm\n",
|
22 |
+
"\n",
|
23 |
+
"from datasets import load_openml_list, test_dids_classification, valid_large_classification, open_cc_dids, open_cc_valid_dids\n"
|
24 |
+
]
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"cell_type": "code",
|
28 |
+
"execution_count": 6,
|
29 |
+
"metadata": {},
|
30 |
+
"outputs": [
|
31 |
+
{
|
32 |
+
"name": "stdout",
|
33 |
+
"output_type": "stream",
|
34 |
+
"text": [
|
35 |
+
"The autoreload extension is already loaded. To reload it, use:\n",
|
36 |
+
" %reload_ext autoreload\n"
|
37 |
+
]
|
38 |
+
}
|
39 |
+
],
|
40 |
+
"source": [
|
41 |
+
"%load_ext autoreload\n",
|
42 |
+
"\n",
|
43 |
+
"%autoreload 2"
|
44 |
+
]
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"cell_type": "markdown",
|
48 |
+
"metadata": {
|
49 |
+
"tags": []
|
50 |
+
},
|
51 |
+
"source": [
|
52 |
+
"### Prepare test datasets"
|
53 |
+
]
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"cell_type": "code",
|
57 |
+
"execution_count": 7,
|
58 |
+
"metadata": {},
|
59 |
+
"outputs": [],
|
60 |
+
"source": [
|
61 |
+
"renamer = {'name': 'Name', 'NumberOfFeatures': '# Features', 'NumberOfSymbolicFeatures': '# Categorical Features', 'NumberOfInstances': '# Instances', 'NumberOfMissingValues': '# NaNs', 'NumberOfClasses': '# Classes', 'MinorityClassSize': 'Minority Class Size'}\n"
|
62 |
+
]
|
63 |
+
},
|
64 |
+
{
|
65 |
+
"cell_type": "code",
|
66 |
+
"execution_count": 8,
|
67 |
+
"metadata": {},
|
68 |
+
"outputs": [
|
69 |
+
{
|
70 |
+
"data": {
|
71 |
+
"text/plain": [
|
72 |
+
"OrderedDict([(99,\n",
|
73 |
+
" {'id': 99,\n",
|
74 |
+
" 'alias': 'OpenML-CC18',\n",
|
75 |
+
" 'main_entity_type': 'task',\n",
|
76 |
+
" 'name': 'OpenML-CC18 Curated Classification benchmark',\n",
|
77 |
+
" 'status': 'active',\n",
|
78 |
+
" 'creation_date': '2019-02-21 18:47:13',\n",
|
79 |
+
" 'creator': 1}),\n",
|
80 |
+
" (225,\n",
|
81 |
+
" {'id': 225,\n",
|
82 |
+
" 'alias': 'OpenML-friendly',\n",
|
83 |
+
" 'main_entity_type': 'task',\n",
|
84 |
+
" 'name': 'OpenML100-friendly',\n",
|
85 |
+
" 'status': 'active',\n",
|
86 |
+
" 'creation_date': '2019-09-16 19:41:46',\n",
|
87 |
+
" 'creator': 1})])"
|
88 |
+
]
|
89 |
+
},
|
90 |
+
"execution_count": 8,
|
91 |
+
"metadata": {},
|
92 |
+
"output_type": "execute_result"
|
93 |
+
}
|
94 |
+
],
|
95 |
+
"source": [
|
96 |
+
"openml.study.list_suites()"
|
97 |
+
]
|
98 |
+
},
|
99 |
+
{
|
100 |
+
"cell_type": "code",
|
101 |
+
"execution_count": 9,
|
102 |
+
"metadata": {},
|
103 |
+
"outputs": [],
|
104 |
+
"source": [
|
105 |
+
"suite = openml.study.get_suite(suite_id=99)\n",
|
106 |
+
"tasks = openml.tasks.list_tasks(output_format=\"dataframe\")"
|
107 |
+
]
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"cell_type": "code",
|
111 |
+
"execution_count": 10,
|
112 |
+
"metadata": {},
|
113 |
+
"outputs": [],
|
114 |
+
"source": [
|
115 |
+
"# Using ``@`` in `pd.DataFrame.query <\n",
|
116 |
+
"# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html>`_\n",
|
117 |
+
"# accesses variables outside of the current dataframe.\n",
|
118 |
+
"tasks = tasks.query(\"tid in @suite.tasks\")"
|
119 |
+
]
|
120 |
+
},
|
121 |
+
{
|
122 |
+
"cell_type": "code",
|
123 |
+
"execution_count": 11,
|
124 |
+
"metadata": {},
|
125 |
+
"outputs": [],
|
126 |
+
"source": [
|
127 |
+
"tids = list(tasks[np.logical_and(np.logical_and((tasks.NumberOfInstances <= 2000), (tasks.NumberOfFeatures <= 100))\n",
|
128 |
+
" , (tasks.NumberOfClasses <= 10))].tid)"
|
129 |
+
]
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"cell_type": "code",
|
133 |
+
"execution_count": 12,
|
134 |
+
"metadata": {},
|
135 |
+
"outputs": [
|
136 |
+
{
|
137 |
+
"data": {
|
138 |
+
"text/plain": [
|
139 |
+
"30"
|
140 |
+
]
|
141 |
+
},
|
142 |
+
"execution_count": 12,
|
143 |
+
"metadata": {},
|
144 |
+
"output_type": "execute_result"
|
145 |
+
}
|
146 |
+
],
|
147 |
+
"source": [
|
148 |
+
"len(tids)"
|
149 |
+
]
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"cell_type": "code",
|
153 |
+
"execution_count": 13,
|
154 |
+
"metadata": {},
|
155 |
+
"outputs": [],
|
156 |
+
"source": [
|
157 |
+
"tids = list(tasks[tasks.NumberOfInstances <= 2000].tid)"
|
158 |
+
]
|
159 |
+
},
|
160 |
+
{
|
161 |
+
"cell_type": "code",
|
162 |
+
"execution_count": 14,
|
163 |
+
"metadata": {},
|
164 |
+
"outputs": [],
|
165 |
+
"source": [
|
166 |
+
"open_cc_dids = [openml.tasks.get_task(task_id).get_dataset().id for task_id in tids]"
|
167 |
+
]
|
168 |
+
},
|
169 |
+
{
|
170 |
+
"cell_type": "code",
|
171 |
+
"execution_count": null,
|
172 |
+
"outputs": [],
|
173 |
+
"source": [
|
174 |
+
"open_ml_datasets, open_ml_datasets_df = load_openml_list(test_dids_classification, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = 100000, num_feats=100, return_capped=True)\n"
|
175 |
+
],
|
176 |
+
"metadata": {
|
177 |
+
"collapsed": false,
|
178 |
+
"pycharm": {
|
179 |
+
"name": "#%%\n"
|
180 |
+
}
|
181 |
+
}
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"cell_type": "code",
|
185 |
+
"execution_count": 16,
|
186 |
+
"metadata": {},
|
187 |
+
"outputs": [],
|
188 |
+
"source": [
|
189 |
+
"open_ml_datasets_df = open_ml_datasets_df[open_ml_datasets_df.NumberOfInstances > 10000]"
|
190 |
+
]
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"cell_type": "code",
|
194 |
+
"execution_count": 17,
|
195 |
+
"metadata": {},
|
196 |
+
"outputs": [
|
197 |
+
{
|
198 |
+
"name": "stdout",
|
199 |
+
"output_type": "stream",
|
200 |
+
"text": [
|
201 |
+
"\\begin{tabular}{lrrrrrrr}\n",
|
202 |
+
"\\toprule\n",
|
203 |
+
" Name & \\# Features & \\# Categorical Features & \\# Instances & \\# Classes & \\# NaNs & Minority Class Size & id \\\\\n",
|
204 |
+
"\\midrule\n",
|
205 |
+
" KDDCup09\\_appetency & 231 & 39 & 50000 & 2 & 8024152 & 890 & 1111 \\\\\n",
|
206 |
+
" airlines & 8 & 5 & 539383 & 2 & 0 & 240264 & 1169 \\\\\n",
|
207 |
+
" bank-marketing & 17 & 10 & 45211 & 2 & 0 & 5289 & 1461 \\\\\n",
|
208 |
+
" nomao & 119 & 30 & 34465 & 2 & 0 & 9844 & 1486 \\\\\n",
|
209 |
+
" adult & 15 & 9 & 48842 & 2 & 6465 & 11687 & 1590 \\\\\n",
|
210 |
+
" covertype & 55 & 45 & 581012 & 7 & 0 & 2747 & 1596 \\\\\n",
|
211 |
+
" numerai28.6 & 22 & 1 & 96320 & 2 & 0 & 47662 & 23517 \\\\\n",
|
212 |
+
" connect-4 & 43 & 43 & 67557 & 3 & 0 & 6449 & 40668 \\\\\n",
|
213 |
+
"jungle\\_chess\\_2pcs\\_raw\\_endgame\\_complete & 7 & 1 & 44819 & 3 & 0 & 4335 & 41027 \\\\\n",
|
214 |
+
" APSFailure & 171 & 1 & 76000 & 2 & 1078695 & 1375 & 41138 \\\\\n",
|
215 |
+
" albert & 79 & 53 & 425240 & 2 & 2734000 & 212620 & 41147 \\\\\n",
|
216 |
+
" MiniBooNE & 51 & 1 & 130064 & 2 & 0 & 36499 & 41150 \\\\\n",
|
217 |
+
" guillermo & 4297 & 1 & 20000 & 2 & 0 & 8003 & 41159 \\\\\n",
|
218 |
+
" riccardo & 4297 & 1 & 20000 & 2 & 0 & 5000 & 41161 \\\\\n",
|
219 |
+
" volkert & 181 & 1 & 58310 & 10 & 0 & 1361 & 41166 \\\\\n",
|
220 |
+
" dionis & 61 & 1 & 416188 & 355 & 0 & 878 & 41167 \\\\\n",
|
221 |
+
" jannis & 55 & 1 & 83733 & 4 & 0 & 1687 & 41168 \\\\\n",
|
222 |
+
" helena & 28 & 1 & 65196 & 100 & 0 & 111 & 41169 \\\\\n",
|
223 |
+
"\\bottomrule\n",
|
224 |
+
"\\end{tabular}\n",
|
225 |
+
"\n"
|
226 |
+
]
|
227 |
+
}
|
228 |
+
],
|
229 |
+
"source": [
|
230 |
+
"print_table = open_ml_datasets_df\n",
|
231 |
+
"print_table = print_table[['name', 'NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].copy()\n",
|
232 |
+
"print_table['id'] = print_table.index\n",
|
233 |
+
"print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']] = print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].astype(int)\n",
|
234 |
+
"print_table = print_table.rename(columns=renamer)\n",
|
235 |
+
"print(print_table.to_latex(index=False))"
|
236 |
+
]
|
237 |
+
},
|
238 |
+
{
|
239 |
+
"cell_type": "markdown",
|
240 |
+
"metadata": {
|
241 |
+
"tags": []
|
242 |
+
},
|
243 |
+
"source": [
|
244 |
+
"### Prepare Validation datasets"
|
245 |
+
]
|
246 |
+
},
|
247 |
+
{
|
248 |
+
"cell_type": "code",
|
249 |
+
"execution_count": null,
|
250 |
+
"outputs": [],
|
251 |
+
"source": [
|
252 |
+
"open_cc_datasets, open_cc_datasets_df = load_openml_list(open_cc_dids, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = 2000, num_feats=100, return_capped=True)\n",
|
253 |
+
"\n",
|
254 |
+
"def extend_datasets(datasets, filtering = False):\n",
|
255 |
+
" extended_datasets = {}\n",
|
256 |
+
" i = 0\n",
|
257 |
+
" for d in tqdm(datasets):\n",
|
258 |
+
" if ((not 'NumberOfFeatures' in datasets[d])\n",
|
259 |
+
" or (not 'NumberOfClasses' in datasets[d])\n",
|
260 |
+
" or (not 'NumberOfInstances' in datasets[d])\n",
|
261 |
+
" # or datasets[d]['NumberOfFeatures'] >= num_feats\n",
|
262 |
+
" or datasets[d]['NumberOfClasses'] <= 0):\n",
|
263 |
+
" print(datasets[d])\n",
|
264 |
+
" continue\n",
|
265 |
+
" ds = openml.datasets.get_dataset(d, download_data=False)\n",
|
266 |
+
" if filtering and (datasets[d]['NumberOfInstances'] < 150\n",
|
267 |
+
" or datasets[d]['NumberOfInstances'] > 2000\n",
|
268 |
+
" or datasets[d]['NumberOfFeatures'] > 100\n",
|
269 |
+
" or datasets[d]['NumberOfClasses'] > 10):\n",
|
270 |
+
" continue\n",
|
271 |
+
" extended_datasets[d] = datasets[d]\n",
|
272 |
+
" extended_datasets[d].update(ds.qualities)\n",
|
273 |
+
" \n",
|
274 |
+
" return extended_datasets\n",
|
275 |
+
"\n",
|
276 |
+
"# All datasets\n",
|
277 |
+
"openml_list = openml.datasets.list_datasets()\n",
|
278 |
+
"openml_list = pd.DataFrame.from_dict(openml_list, orient=\"index\")\n",
|
279 |
+
"\n",
|
280 |
+
"# Select only classification\n",
|
281 |
+
"openml_list = openml_list[~openml_list['MajorityClassSize'].isna()]\n",
|
282 |
+
"\n",
|
283 |
+
"# Remove duplicated datasets\n",
|
284 |
+
"duplicated = openml_list.duplicated(subset=['MajorityClassSize', 'MaxNominalAttDistinctValues', 'MinorityClassSize',\n",
|
285 |
+
" 'NumberOfClasses', 'NumberOfFeatures', 'NumberOfInstances',\n",
|
286 |
+
" 'NumberOfInstancesWithMissingValues', 'NumberOfMissingValues',\n",
|
287 |
+
" 'NumberOfNumericFeatures', 'NumberOfSymbolicFeatures'], keep='first')\n",
|
288 |
+
"openml_list = openml_list[~duplicated]\n",
|
289 |
+
"\n",
|
290 |
+
"duplicated = openml_list.duplicated(subset=['name'], keep='first')\n",
|
291 |
+
"openml_list = openml_list[~duplicated]\n",
|
292 |
+
"\n",
|
293 |
+
"# Filter out datasets that don't have meta information or Don't fulfill other criteria\n",
|
294 |
+
"openml_list = openml_list.to_dict(orient='index')\n",
|
295 |
+
"openml_list = pd.DataFrame.from_dict(extend_datasets(openml_list, filtering=True), orient=\"index\")\n",
|
296 |
+
"\n",
|
297 |
+
"# Filter out datasets in Open CC\n",
|
298 |
+
"openml_list = openml_list[~openml_list.name.apply(lambda x: x in test_datasets_multiclass_df.name.values)]\n",
|
299 |
+
"openml_list['CFI'] = openml_list.apply(lambda x: str(x.NumberOfClasses) + '_' + str(x.NumberOfFeatures) + '_' + str(x.NumberOfInstances), axis = 1)\n",
|
300 |
+
"test_datasets_multiclass_df['CFI'] = test_datasets_multiclass_df.apply(lambda x: str(x.NumberOfClasses) + '_' + str(x.NumberOfFeatures) + '_' + str(x.NumberOfInstances), axis = 1)\n",
|
301 |
+
"openml_list = openml_list[~openml_list.CFI.apply(lambda x: x in test_datasets_multiclass_df.CFI.values)]\n",
|
302 |
+
"\n",
|
303 |
+
"# Remove time series and artificial data\n",
|
304 |
+
"openml_list = openml_list[~openml_list.name.apply(lambda x: 'autoUniv' in x)]\n",
|
305 |
+
"openml_list = openml_list[~openml_list.name.apply(lambda x: 'fri_' in x)]\n",
|
306 |
+
"openml_list = openml_list[~openml_list.name.apply(lambda x: 'FOREX' in x)]\n",
|
307 |
+
"\n",
|
308 |
+
"# Remove datasets that overlapped with Open CC closely by name\n",
|
309 |
+
"openml_list = openml_list[~openml_list.name.apply(lambda x: 'ilpd' in x)]\n",
|
310 |
+
"openml_list = openml_list[~openml_list.name.apply(lambda x: 'car' in x)]\n",
|
311 |
+
"openml_list = openml_list[~openml_list.name.apply(lambda x: 'pc1' in x)]\n",
|
312 |
+
"\n",
|
313 |
+
"# Remove datasets that didn't load\n",
|
314 |
+
"openml_list = openml_list[~openml_list.did.apply(lambda x: x in {1065, 40589, 41496, 770, 43097, 43148, 43255, 43595, 43786, 41701})]\n",
|
315 |
+
"\n",
|
316 |
+
"# Remove class skew\n",
|
317 |
+
"openml_list = openml_list[(openml_list.MinorityClassSize / openml_list.MajorityClassSize) > 0.05]\n",
|
318 |
+
"openml_list = openml_list[openml_list.AutoCorrelation != 1]\n",
|
319 |
+
"\n",
|
320 |
+
"# Remove too easy\n",
|
321 |
+
"openml_list = openml_list[openml_list.CfsSubsetEval_DecisionStumpAUC != 1]"
|
322 |
+
],
|
323 |
+
"metadata": {
|
324 |
+
"collapsed": false,
|
325 |
+
"pycharm": {
|
326 |
+
"name": "#%%\n"
|
327 |
+
}
|
328 |
+
}
|
329 |
+
},
|
330 |
+
{
|
331 |
+
"cell_type": "code",
|
332 |
+
"execution_count": null,
|
333 |
+
"metadata": {},
|
334 |
+
"outputs": [],
|
335 |
+
"source": [
|
336 |
+
"print_table = openml_list\n",
|
337 |
+
"print_table = print_table[['name', 'NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].copy()\n",
|
338 |
+
"print_table['id'] = print_table.index\n",
|
339 |
+
"print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']] = print_table[['NumberOfFeatures', 'NumberOfSymbolicFeatures', 'NumberOfInstances', 'NumberOfClasses', 'NumberOfMissingValues', 'MinorityClassSize']].astype(int)\n",
|
340 |
+
"print_table = print_table.rename(columns=renamer)\n",
|
341 |
+
"print(print_table.to_latex(index=False))"
|
342 |
+
]
|
343 |
+
},
|
344 |
+
{
|
345 |
+
"cell_type": "code",
|
346 |
+
"execution_count": null,
|
347 |
+
"metadata": {},
|
348 |
+
"outputs": [],
|
349 |
+
"source": []
|
350 |
+
}
|
351 |
+
],
|
352 |
+
"metadata": {
|
353 |
+
"kernelspec": {
|
354 |
+
"display_name": "Python 3 (ipykernel)",
|
355 |
+
"language": "python",
|
356 |
+
"name": "python3"
|
357 |
+
},
|
358 |
+
"language_info": {
|
359 |
+
"codemirror_mode": {
|
360 |
+
"name": "ipython",
|
361 |
+
"version": 3
|
362 |
+
},
|
363 |
+
"file_extension": ".py",
|
364 |
+
"mimetype": "text/x-python",
|
365 |
+
"name": "python",
|
366 |
+
"nbconvert_exporter": "python",
|
367 |
+
"pygments_lexer": "ipython3",
|
368 |
+
"version": "3.7.13"
|
369 |
+
}
|
370 |
+
},
|
371 |
+
"nbformat": 4,
|
372 |
+
"nbformat_minor": 4
|
373 |
+
}
|
TabPFN/README.md
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# TabPFN
|
2 |
+
|
3 |
+
## Installation
|
4 |
+
```
|
5 |
+
git clone git@github.com:automl/TabPFN.git
|
6 |
+
cd TabPFN
|
7 |
+
conda create -n TabPFN python=3.7
|
8 |
+
conda activate TabPFN
|
9 |
+
pip install -r requirements.txt
|
10 |
+
```
|
11 |
+
|
12 |
+
To run the autogluon baseline please create a separate environment and install autogluon==0.4.0, installation in the same environment as our other baselines is not possible.
|
13 |
+
|
14 |
+
## Usage
|
15 |
+
TrainingTuningAndPrediction: Train a TabPFN, Prior Tune and predict using a pretrained model.
|
16 |
+
|
17 |
+
TabularEvaluationVisualization: Run Baselines and load Baseline and TabPFN Results for comparison and plotting.
|
18 |
+
|
19 |
+
PrepareDatasets: Notebook used to inspect Datasets (Not needed to run baselines / TabPFN).
|
20 |
+
|
21 |
+
SytheticGPAblation: Ablation experiments for Gaussian Process fitting with differentiable Hyper Parameters.
|
22 |
+
|
23 |
+
|
TabPFN/SyntheticGPAblation.ipynb
ADDED
@@ -0,0 +1,392 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"%load_ext autoreload\n",
|
10 |
+
"\n",
|
11 |
+
"%autoreload 2"
|
12 |
+
]
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"cell_type": "code",
|
16 |
+
"execution_count": 2,
|
17 |
+
"metadata": {},
|
18 |
+
"outputs": [],
|
19 |
+
"source": [
|
20 |
+
"import os\n",
|
21 |
+
"import time\n",
|
22 |
+
"\n",
|
23 |
+
"import torch\n",
|
24 |
+
"\n",
|
25 |
+
"import numpy as np\n",
|
26 |
+
"\n",
|
27 |
+
"import matplotlib.pyplot as plt\n",
|
28 |
+
"\n",
|
29 |
+
"from model_builder import get_model, get_default_spec, save_model, load_model\n",
|
30 |
+
"\n",
|
31 |
+
"from scripts.model_configs import *"
|
32 |
+
]
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"cell_type": "markdown",
|
36 |
+
"metadata": {
|
37 |
+
"tags": []
|
38 |
+
},
|
39 |
+
"source": [
|
40 |
+
"# Setting params"
|
41 |
+
]
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"cell_type": "code",
|
45 |
+
"execution_count": 6,
|
46 |
+
"metadata": {},
|
47 |
+
"outputs": [],
|
48 |
+
"source": [
|
49 |
+
"device = 'cuda'\n",
|
50 |
+
"base_path = os.path.join('.')"
|
51 |
+
]
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"cell_type": "code",
|
55 |
+
"execution_count": 7,
|
56 |
+
"metadata": {},
|
57 |
+
"outputs": [],
|
58 |
+
"source": [
|
59 |
+
"def train_function(config_sample, i, add_name=''):\n",
|
60 |
+
" start_time = time.time()\n",
|
61 |
+
" N_epochs_to_save = 50\n",
|
62 |
+
" \n",
|
63 |
+
" def save_callback(model, epoch):\n",
|
64 |
+
" if not hasattr(model, 'last_saved_epoch'):\n",
|
65 |
+
" model.last_saved_epoch = 0\n",
|
66 |
+
" if ((time.time() - start_time) / (maximum_runtime * 60 / N_epochs_to_save)) > model.last_saved_epoch:\n",
|
67 |
+
" print('Saving model..')\n",
|
68 |
+
" config_sample['epoch_in_training'] = epoch\n",
|
69 |
+
" save_model(model, base_path, f'models_diff/prior_diff_real_checkpoint{add_name}_n_{i}_epoch_{model.last_saved_epoch}.cpkt',\n",
|
70 |
+
" config_sample)\n",
|
71 |
+
" model.last_saved_epoch = model.last_saved_epoch + 1 # TODO: Rename to checkpoint\n",
|
72 |
+
" \n",
|
73 |
+
" model = get_model(config_sample\n",
|
74 |
+
" , device\n",
|
75 |
+
" , should_train=True\n",
|
76 |
+
" , verbose=1\n",
|
77 |
+
" , epoch_callback = save_callback)\n",
|
78 |
+
" \n",
|
79 |
+
" return"
|
80 |
+
]
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"cell_type": "markdown",
|
84 |
+
"metadata": {
|
85 |
+
"heading_collapsed": true,
|
86 |
+
"tags": []
|
87 |
+
},
|
88 |
+
"source": [
|
89 |
+
"# Check synthetic data fitting"
|
90 |
+
]
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"cell_type": "markdown",
|
94 |
+
"metadata": {
|
95 |
+
"tags": []
|
96 |
+
},
|
97 |
+
"source": [
|
98 |
+
"#### Workflow functions"
|
99 |
+
]
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"cell_type": "code",
|
103 |
+
"execution_count": 8,
|
104 |
+
"metadata": {
|
105 |
+
"hidden": true,
|
106 |
+
"tags": []
|
107 |
+
},
|
108 |
+
"outputs": [],
|
109 |
+
"source": [
|
110 |
+
"def generate_test_data(test_gp_params):\n",
|
111 |
+
" # Generate test data\n",
|
112 |
+
" config = {**test_gp_params}\n",
|
113 |
+
"\n",
|
114 |
+
" config['verbose'] = False\n",
|
115 |
+
" config['differentiable'] = False\n",
|
116 |
+
" #config['bptt'] = config['bptt_in_training']\n",
|
117 |
+
"\n",
|
118 |
+
" model_test_data = get_model(config, device, should_train=False, verbose=True)\n",
|
119 |
+
" (hp_embedding, data, targets_), targets = next(iter(model_test_data[3]))\n",
|
120 |
+
" (hp_embedding, data, targets_), targets = (hp_embedding, data.to(device), targets_.to(device)), targets.to(device)\n",
|
121 |
+
" \n",
|
122 |
+
" return (hp_embedding, data, targets_), targets\n",
|
123 |
+
"\n",
|
124 |
+
"def evaluate_hp_range(model, hparam_true, vary_hparam_ind, data, targets, eval_pos, plot_step_size):\n",
|
125 |
+
" losses, hparams = [], []\n",
|
126 |
+
" for l in np.arange(-1.74, 1.74, plot_step_size):\n",
|
127 |
+
" hparam = [*hparam_true]\n",
|
128 |
+
" hparam[vary_hparam_ind] = l\n",
|
129 |
+
" hp_embedding_used = torch.tensor(hparam).to(device).float()\n",
|
130 |
+
" with torch.inference_mode():\n",
|
131 |
+
" outputs = torch.sigmoid(model[2]((hp_embedding_used.repeat(data.shape[1], 1), data, targets.float()), single_eval_pos=eval_pos)).squeeze(-1)\n",
|
132 |
+
" \n",
|
133 |
+
" loss = torch.nn.BCELoss()(outputs.flatten(), targets[eval_pos:].flatten()).detach().cpu()\n",
|
134 |
+
" losses += [loss]\n",
|
135 |
+
" hparam_real = [diff_hparams_f[i][1](hp) for i, hp in enumerate(hparam)]\n",
|
136 |
+
" hparams += [hparam_real]\n",
|
137 |
+
" \n",
|
138 |
+
" print(loss, hparam_real, hparam, outputs.shape)\n",
|
139 |
+
" return np.array(losses), np.array(hparams)"
|
140 |
+
]
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"cell_type": "code",
|
144 |
+
"execution_count": 9,
|
145 |
+
"metadata": {},
|
146 |
+
"outputs": [],
|
147 |
+
"source": [
|
148 |
+
"def differentiable_hparam_tuning_workflow(config_sample, hparam_label, batch_size=4, N_grad_steps=50, plot_step_size=0.1):\n",
|
149 |
+
" test_gp_params = {\n",
|
150 |
+
" \"lengthscale\": 1.0,\n",
|
151 |
+
" #\"lengthscale_mean\": true_lengthscale,\n",
|
152 |
+
" #\"lengthscale_std\": 0.5,\n",
|
153 |
+
" \"noise\": 0.2,\n",
|
154 |
+
" \"outputscale\": 1.0,\n",
|
155 |
+
" 'batch_size': batch_size\n",
|
156 |
+
" }\n",
|
157 |
+
" config_sample.update(test_gp_params)\n",
|
158 |
+
" (hp_embedding, data, targets_), targets = generate_test_data(config_sample)\n",
|
159 |
+
" hparam_true = [diff_hparams_f[i][0](test_gp_params[hp]) for i, hp in enumerate(diff_hparams_keys)]\n",
|
160 |
+
" #hparam_true = [test_gp_params[hp] for i, hp in enumerate(diff_hparams_keys)]\n",
|
161 |
+
"\n",
|
162 |
+
" for vary_hparam_ind, vary_hparam_name in hparam_label:\n",
|
163 |
+
" print(vary_hparam_name)\n",
|
164 |
+
"\n",
|
165 |
+
" losses, hparams = evaluate_hp_range(model, hparam_true, vary_hparam_ind, data, targets, eval_pos, plot_step_size=plot_step_size)\n",
|
166 |
+
"\n",
|
167 |
+
" # TODO: Make only one parameter diffable\n",
|
168 |
+
" hparam = torch.tensor([*hparam_true]).to(device).float()\n",
|
169 |
+
" hparam[vary_hparam_ind] = hparam[vary_hparam_ind] + 0.1 #random.random() * 2 - 1\n",
|
170 |
+
" hparam = torch.nn.Parameter(hparam, requires_grad=True)\n",
|
171 |
+
" hparam_grad_mask = torch.zeros_like(hparam)\n",
|
172 |
+
" hparam_grad_mask[vary_hparam_ind] = 1\n",
|
173 |
+
"\n",
|
174 |
+
" optimizer = torch.optim.Adam([hparam], lr=0.1)\n",
|
175 |
+
" \n",
|
176 |
+
" for t in range(N_grad_steps):\n",
|
177 |
+
" style = hparam.repeat(data.shape[1], 1)\n",
|
178 |
+
" outputs = torch.sigmoid(model[2]((style, data, targets.float()), single_eval_pos=eval_pos)).squeeze(-1)\n",
|
179 |
+
" loss = torch.nn.BCELoss()(outputs.flatten(), targets[eval_pos:].flatten())\n",
|
180 |
+
" optimizer.zero_grad()\n",
|
181 |
+
" loss.backward()\n",
|
182 |
+
" with torch.no_grad():\n",
|
183 |
+
" hparam.grad *= hparam_grad_mask\n",
|
184 |
+
" optimizer.step()\n",
|
185 |
+
" print('loss:', loss, 'hparams', diff_hparams_f[vary_hparam_ind][1](hparam[vary_hparam_ind]), 'true', diff_hparams_f[vary_hparam_ind][1](hparam_true[vary_hparam_ind]))\n",
|
186 |
+
" inferred_param = diff_hparams_f[vary_hparam_ind][1](hparam[vary_hparam_ind].cpu().detach().numpy())\n",
|
187 |
+
" return hparams, losses, inferred_param, vary_hparam_ind, hparam_true\n",
|
188 |
+
" "
|
189 |
+
]
|
190 |
+
},
|
191 |
+
{
|
192 |
+
"cell_type": "markdown",
|
193 |
+
"metadata": {
|
194 |
+
"tags": []
|
195 |
+
},
|
196 |
+
"source": [
|
197 |
+
"#### Fitting a PFN with HP-Diffable GP Prior"
|
198 |
+
]
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"cell_type": "code",
|
202 |
+
"execution_count": 10,
|
203 |
+
"metadata": {
|
204 |
+
"hidden": true,
|
205 |
+
"tags": []
|
206 |
+
},
|
207 |
+
"outputs": [],
|
208 |
+
"source": [
|
209 |
+
"num_features = 5\n",
|
210 |
+
"bptt = 200\n",
|
211 |
+
"eval_positions = [100]\n",
|
212 |
+
"\n",
|
213 |
+
"config_general = get_general_config(num_features, bptt, eval_positions)\n",
|
214 |
+
"config_flexible_categorical = get_flexible_categorical_config(num_features)\n",
|
215 |
+
"\n",
|
216 |
+
"config_gp = {'noise': 0.2, \"lengthscale\": 1.0, \"outputscale\": 1.0}\n",
|
217 |
+
"config_diff_gp = {'differentiable_hyperparameters': {\n",
|
218 |
+
" 'outputscale': {'distribution': 'uniform', 'min': 0., 'max': 10.0},\n",
|
219 |
+
" 'lengthscale': {'distribution': 'uniform', 'min': 0., 'max': 10.0},\n",
|
220 |
+
" 'noise': {'distribution': 'uniform', 'min': 0.0000001, 'max': 0.5},\n",
|
221 |
+
" }\n",
|
222 |
+
"}\n",
|
223 |
+
"\n",
|
224 |
+
"config = {**config_general, **config_flexible_categorical, **config_diff_gp, **config_gp}\n",
|
225 |
+
"\n",
|
226 |
+
"config['prior_type'], config['differentiable'], config['flexible'] = 'gp', True, True\n",
|
227 |
+
"config['num_features'], config['num_features_used'] = num_features, num_features\n",
|
228 |
+
"config['epochs'], config['num_steps'], config['verbose'] = 500, 100, False\n",
|
229 |
+
"config[\"lr\"] = 0.00001\n",
|
230 |
+
"config[\"dropout\"] = 0\n",
|
231 |
+
"config[\"emsize\"] = 512\n",
|
232 |
+
"config[\"batch_size\"] = 128\n",
|
233 |
+
"config[\"aggregate_k_gradients\"] = 1\n",
|
234 |
+
"config['set_value_to_nan'] = 0.0\n",
|
235 |
+
"config['output_multiclass_ordered_p'] = 1.0\n",
|
236 |
+
"config['categorical_feature_p'] = 0.0\n",
|
237 |
+
"config['nan_prob_a_reason'] = 0.0\n",
|
238 |
+
"config['nan_prob_no_reason'] = 0.0\n",
|
239 |
+
"config['nan_prob_unknown_reason'] = 0.0\n",
|
240 |
+
"config[\"nlayers\"] = 8\n",
|
241 |
+
"\n",
|
242 |
+
"# TODO: This should not be sampled, but be one config\n",
|
243 |
+
"# TODO: This uses old hyperparam sampler throws error\n",
|
244 |
+
"config_sample = evaluate_hypers(config)"
|
245 |
+
]
|
246 |
+
},
|
247 |
+
{
|
248 |
+
"cell_type": "code",
|
249 |
+
"execution_count": 11,
|
250 |
+
"metadata": {
|
251 |
+
"hidden": true,
|
252 |
+
"tags": []
|
253 |
+
},
|
254 |
+
"outputs": [
|
255 |
+
{
|
256 |
+
"name": "stdout",
|
257 |
+
"output_type": "stream",
|
258 |
+
"text": [
|
259 |
+
"Using style prior: True\n",
|
260 |
+
"Using cpu:0 device\n",
|
261 |
+
"Not using distributed\n",
|
262 |
+
"DataLoader.__dict__ {'num_steps': 100, 'fuse_x_y': False, 'get_batch_kwargs': {'batch_size': 128, 'seq_len': 200, 'seq_len_maximum': 200, 'device': 'cpu:0', 'num_features': 5, 'hyperparameters': {'lr': 1e-05, 'dropout': 0, 'emsize': 512, 'batch_size': 128, 'nlayers': 8, 'num_features': 5, 'nhead': 4, 'nhid_factor': 2, 'bptt': 200, 'eval_positions': None, 'seq_len_used': 200, 'sampling': 'normal', 'epochs': 500, 'num_steps': 100, 'verbose': False, 'pre_sample_causes': True, 'mix_activations': False, 'nan_prob_unknown_reason_reason_prior': 1.0, 'categorical_feature_p': 0.0, 'nan_prob_no_reason': 0.0, 'nan_prob_unknown_reason': 0.0, 'nan_prob_a_reason': 0.0, 'max_num_classes': 2, 'num_classes': 2, 'noise_type': 'Gaussian', 'balanced': True, 'normalize_to_ranking': False, 'set_value_to_nan': 0.0, 'normalize_by_used_features': True, 'num_features_used': 5, 'differentiable_hyperparameters': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'noise': 0.2, 'lengthscale': 1.0, 'outputscale': 1.0, 'prior_type': 'gp', 'differentiable': True, 'flexible': True, 'aggregate_k_gradients': 1, 'output_multiclass_ordered_p': 1.0, 'recompute_attn': False}, 'num_outputs': 1, 'dynamic_batch_size': 2, 'get_batch': <function get_model.<locals>.make_get_batch.<locals>.<lambda> at 0x7f39ad8dcf80>, 'differentiable_hyperparameters': {'outputscale': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'lengthscale': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'noise': {'distribution': 'uniform', 'min': 1e-07, 'max': 0.5}}}, 'num_features': 5, 'num_outputs': 1}\n",
|
263 |
+
"Using a Transformer with 17.35 M parameters\n"
|
264 |
+
]
|
265 |
+
}
|
266 |
+
],
|
267 |
+
"source": [
|
268 |
+
"device = 'cuda'\n",
|
269 |
+
"train_function(config_sample, 0, add_name='gp_experiments_diff_with_noise_no_meta_new')"
|
270 |
+
]
|
271 |
+
},
|
272 |
+
{
|
273 |
+
"cell_type": "markdown",
|
274 |
+
"metadata": {
|
275 |
+
"tags": []
|
276 |
+
},
|
277 |
+
"source": [
|
278 |
+
"#### Evaluating a PFN (with pretrained model)"
|
279 |
+
]
|
280 |
+
},
|
281 |
+
{
|
282 |
+
"cell_type": "code",
|
283 |
+
"execution_count": 13,
|
284 |
+
"metadata": {
|
285 |
+
"hidden": true,
|
286 |
+
"tags": []
|
287 |
+
},
|
288 |
+
"outputs": [
|
289 |
+
{
|
290 |
+
"name": "stdout",
|
291 |
+
"output_type": "stream",
|
292 |
+
"text": [
|
293 |
+
"Using style prior: True\n",
|
294 |
+
"Using cpu:0 device\n",
|
295 |
+
"Not using distributed\n",
|
296 |
+
"DataLoader.__dict__ {'num_steps': 100, 'fuse_x_y': False, 'get_batch_kwargs': {'batch_size': 1, 'seq_len': 10, 'seq_len_maximum': 10, 'device': 'cpu:0', 'num_features': 5, 'hyperparameters': {'lr': 1e-05, 'dropout': 0, 'emsize': 512, 'batch_size': 1, 'nlayers': 8, 'num_features': 5, 'nhead': 4, 'nhid_factor': 2, 'bptt': 10, 'eval_positions': [190], 'seq_len_used': 200, 'sampling': 'normal', 'epochs': 500, 'num_steps': 100, 'verbose': False, 'pre_sample_causes': True, 'mix_activations': False, 'nan_prob_unknown_reason_reason_prior': 1.0, 'output_multiclass_ordered_p': 1.0, 'categorical_feature_p': 0.0, 'nan_prob_no_reason': 0.0, 'nan_prob_unknown_reason': 0.0, 'nan_prob_a_reason': 0.0, 'max_num_classes': 2, 'num_classes': 2, 'noise_type': 'Gaussian', 'balanced': True, 'multiclass_type': 'rank', 'normalize_to_ranking': False, 'set_value_to_nan': 0.0, 'normalize_by_used_features': True, 'num_features_used': <function load_model.<locals>.<lambda> at 0x7f39ad8534d0>, 'differentiable_hyperparameters': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'noise': 0.03, 'lengthscale': 1.0, 'outputscale': 1.0, 'prior_type': 'gp', 'differentiable': True, 'flexible': True, 'aggregate_k_gradients': 1, 'recompute_attn': False, 'bptt_extra_samples': None, 'epoch_in_training': 0.998, 'categorical_features_sampler': <function load_model.<locals>.<lambda> at 0x7f39ad853680>, 'num_features_used_in_training': 5, 'num_classes_in_training': 2, 'batch_size_in_training': 128, 'bptt_in_training': 200, 'bptt_extra_samples_in_training': None}, 'num_outputs': 1, 'dynamic_batch_size': 2, 'get_batch': <function get_model.<locals>.make_get_batch.<locals>.<lambda> at 0x7f39ad81ab90>, 'differentiable_hyperparameters': {'outputscale': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'lengthscale': {'distribution': 'uniform', 'min': 0.0, 'max': 10.0}, 'noise': {'distribution': 'uniform', 'min': 1e-07, 'max': 0.5}}}, 'num_features': 5, 'num_outputs': 1}\n",
|
297 |
+
"Using a Transformer with 17.35 M parameters\n"
|
298 |
+
]
|
299 |
+
}
|
300 |
+
],
|
301 |
+
"source": [
|
302 |
+
"device = 'cpu'\n",
|
303 |
+
"model, c = load_model(base_path, f'models_diff/gp_ablation_model.cpkt', device, eval_positions, verbose=False)"
|
304 |
+
]
|
305 |
+
},
|
306 |
+
{
|
307 |
+
"cell_type": "code",
|
308 |
+
"execution_count": 14,
|
309 |
+
"metadata": {},
|
310 |
+
"outputs": [],
|
311 |
+
"source": [
|
312 |
+
"from priors.differentiable_prior import DifferentiableHyperparameterList\n",
|
313 |
+
"diff_list = DifferentiableHyperparameterList(c['differentiable_hyperparameters'], 512, device)\n",
|
314 |
+
"diff_hparams_keys, diff_hparams_f = diff_list.get_hyperparameter_info()"
|
315 |
+
]
|
316 |
+
},
|
317 |
+
{
|
318 |
+
"cell_type": "code",
|
319 |
+
"execution_count": null,
|
320 |
+
"metadata": {
|
321 |
+
"tags": []
|
322 |
+
},
|
323 |
+
"outputs": [],
|
324 |
+
"source": [
|
325 |
+
"model[2].eval()\n",
|
326 |
+
"eval_pos = 100\n",
|
327 |
+
"\n",
|
328 |
+
"hparam_label = [(1, 'outputscale')]\n",
|
329 |
+
"hparam_label = [(0, 'lengthscale')]\n",
|
330 |
+
"hparam_label = [(2, 'noise')]\n",
|
331 |
+
"hparam_labels = [[(1, 'outputscale')], [(2, 'noise')], [(0, 'lengthscale')]]\n",
|
332 |
+
"#hparam_labels = [[(2, 'noise')]]\n",
|
333 |
+
"\n",
|
334 |
+
"hparams, losses, inferred_param, vary_hparam_ind, hparam_true = {}, {}, {}, {}, {}\n",
|
335 |
+
"\n",
|
336 |
+
"for hparam_label in hparam_labels:\n",
|
337 |
+
" (hparams[hparam_label[0][1]], losses[hparam_label[0][1]], inferred_param[hparam_label[0][1]], vary_hparam_ind[hparam_label[0][1]], \n",
|
338 |
+
" hparam_true[hparam_label[0][1]]) = differentiable_hparam_tuning_workflow(config_sample, \n",
|
339 |
+
" hparam_label=hparam_label, \n",
|
340 |
+
" batch_size=256, \n",
|
341 |
+
" N_grad_steps=50,\n",
|
342 |
+
" plot_step_size=0.05)\n"
|
343 |
+
]
|
344 |
+
},
|
345 |
+
{
|
346 |
+
"cell_type": "code",
|
347 |
+
"execution_count": null,
|
348 |
+
"metadata": {},
|
349 |
+
"outputs": [],
|
350 |
+
"source": [
|
351 |
+
"label = 'lengthscale'\n",
|
352 |
+
"\n",
|
353 |
+
"#import tikzplotlib\n",
|
354 |
+
"\n",
|
355 |
+
"inferred = losses[label]\n",
|
356 |
+
"\n",
|
357 |
+
"plt.plot(hparams[label][:, vary_hparam_ind[label]], losses[label])\n",
|
358 |
+
"true = diff_hparams_f[vary_hparam_ind[label]][1](hparam_true[label][vary_hparam_ind[label]])\n",
|
359 |
+
"plt.axvline(x=inferred_param[label], linestyle='solid', color='red')\n",
|
360 |
+
"plt.axvline(x=true, linestyle='dashed')\n",
|
361 |
+
"\n",
|
362 |
+
"plt.ylabel('Cross entropy Loss')\n",
|
363 |
+
"plt.xlabel(label)\n",
|
364 |
+
"\n",
|
365 |
+
"#tikzplotlib.save(f'diff_inferred_params_{label}.tex', axis_height='5.2cm', axis_width='5.2cm', strict=True)\n",
|
366 |
+
"\n",
|
367 |
+
"plt.show()"
|
368 |
+
]
|
369 |
+
}
|
370 |
+
],
|
371 |
+
"metadata": {
|
372 |
+
"kernelspec": {
|
373 |
+
"display_name": "Python 3 (ipykernel)",
|
374 |
+
"language": "python",
|
375 |
+
"name": "python3"
|
376 |
+
},
|
377 |
+
"language_info": {
|
378 |
+
"codemirror_mode": {
|
379 |
+
"name": "ipython",
|
380 |
+
"version": 3
|
381 |
+
},
|
382 |
+
"file_extension": ".py",
|
383 |
+
"mimetype": "text/x-python",
|
384 |
+
"name": "python",
|
385 |
+
"nbconvert_exporter": "python",
|
386 |
+
"pygments_lexer": "ipython3",
|
387 |
+
"version": "3.7.13"
|
388 |
+
}
|
389 |
+
},
|
390 |
+
"nbformat": 4,
|
391 |
+
"nbformat_minor": 4
|
392 |
+
}
|
TabPFN/TabPFNPredictionOnly.ipynb
ADDED
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
        "This notebook shows how to use TabPFN for tabular prediction with a scikit-learn wrapper.\n",
|
8 |
+
"\n",
|
9 |
+
"classifier = TabPFNClassifier(device='cpu')\n",
|
10 |
+
"classifier.fit(train_xs, train_ys)\n",
|
11 |
+
"prediction_ = classifier.predict(test_xs)\n",
|
12 |
+
"\n",
|
13 |
+
"The fit function does not perform any computations, but only saves the training data. Computations are only done at inference time, when calling predict.\n",
|
14 |
+
"Note that the presaved models were trained for up to 100 features, 10 classes and 1000 samples. While the model does not have a hard bound on the number of samples, the features and classes are restricted and larger sizes lead to an error."
|
15 |
+
]
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"cell_type": "markdown",
|
19 |
+
"metadata": {
|
20 |
+
"tags": []
|
21 |
+
},
|
22 |
+
"source": [
|
23 |
+
"### Setup"
|
24 |
+
]
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"cell_type": "code",
|
28 |
+
"execution_count": null,
|
29 |
+
"metadata": {},
|
30 |
+
"outputs": [],
|
31 |
+
"source": [
|
32 |
+
"%load_ext autoreload\n",
|
33 |
+
"\n",
|
34 |
+
"%autoreload 2"
|
35 |
+
]
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"cell_type": "code",
|
39 |
+
"execution_count": null,
|
40 |
+
"metadata": {},
|
41 |
+
"outputs": [],
|
42 |
+
"source": [
|
43 |
+
"import time\n",
|
44 |
+
"import torch\n",
|
45 |
+
"import numpy as np\n",
|
46 |
+
"import os\n",
|
47 |
+
"import random\n",
|
48 |
+
"\n",
|
49 |
+
"from model_builder import get_model, get_default_spec, save_model, load_model\n",
|
50 |
+
"from scripts.transformer_prediction_interface import transformer_predict, get_params_from_config, TabPFNClassifier\n",
|
51 |
+
"\n",
|
52 |
+
"from datasets import load_openml_list, open_cc_dids, open_cc_valid_dids\n",
|
53 |
+
"\n",
|
54 |
+
"from scripts import tabular_metrics"
|
55 |
+
]
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"cell_type": "code",
|
59 |
+
"execution_count": null,
|
60 |
+
"metadata": {},
|
61 |
+
"outputs": [],
|
62 |
+
"source": [
|
63 |
+
"base_path = '.'"
|
64 |
+
]
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"cell_type": "markdown",
|
68 |
+
"metadata": {
|
69 |
+
"tags": []
|
70 |
+
},
|
71 |
+
"source": [
|
72 |
+
"### Load datasets"
|
73 |
+
]
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"cell_type": "code",
|
77 |
+
"execution_count": null,
|
78 |
+
"metadata": {
|
79 |
+
"jupyter": {
|
80 |
+
"outputs_hidden": true
|
81 |
+
},
|
82 |
+
"tags": []
|
83 |
+
},
|
84 |
+
"outputs": [],
|
85 |
+
"source": [
|
86 |
+
"max_samples = 10000\n",
|
87 |
+
"bptt = 10000\n",
|
88 |
+
"\n",
|
89 |
+
"cc_test_datasets_multiclass, cc_test_datasets_multiclass_df = load_openml_list(open_cc_dids, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = max_samples, num_feats=100, return_capped=True)\n",
|
90 |
+
"cc_valid_datasets_multiclass, cc_valid_datasets_multiclass_df = load_openml_list(open_cc_valid_dids, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = max_samples, num_feats=100, return_capped=True)\n",
|
91 |
+
"\n",
|
92 |
+
"# Loading longer OpenML Datasets for generalization experiments (optional)\n",
|
93 |
+
"# test_datasets_multiclass, test_datasets_multiclass_df = load_openml_list(test_dids_classification, multiclass=True, shuffled=True, filter_for_nan=False, max_samples = 10000, num_feats=100, return_capped=True)\n",
|
94 |
+
"\n",
|
95 |
+
"random.seed(0)\n",
|
96 |
+
"random.shuffle(cc_valid_datasets_multiclass)"
|
97 |
+
]
|
98 |
+
},
|
99 |
+
{
|
100 |
+
"cell_type": "code",
|
101 |
+
"execution_count": null,
|
102 |
+
"metadata": {},
|
103 |
+
"outputs": [],
|
104 |
+
"source": [
|
105 |
+
"from datasets import get_openml_classification"
|
106 |
+
]
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"cell_type": "code",
|
110 |
+
"execution_count": null,
|
111 |
+
"metadata": {},
|
112 |
+
"outputs": [],
|
113 |
+
"source": [
|
114 |
+
"dataset = openml.datasets.get_dataset(31)\n",
|
115 |
+
"X, y, categorical_indicator, attribute_names = dataset.get_data(\n",
|
116 |
+
" dataset_format=\"array\", target=dataset.default_target_attribute\n",
|
117 |
+
" )"
|
118 |
+
]
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"cell_type": "code",
|
122 |
+
"execution_count": null,
|
123 |
+
"metadata": {},
|
124 |
+
"outputs": [],
|
125 |
+
"source": [
|
126 |
+
"def get_datasets(selector, task_type, suite='cc'):\n",
|
127 |
+
" if task_type == 'binary':\n",
|
128 |
+
" ds = valid_datasets_binary if selector == 'valid' else test_datasets_binary\n",
|
129 |
+
" else:\n",
|
130 |
+
" if suite == 'openml':\n",
|
131 |
+
" ds = valid_datasets_multiclass if selector == 'valid' else test_datasets_multiclass\n",
|
132 |
+
" elif suite == 'cc':\n",
|
133 |
+
" ds = cc_valid_datasets_multiclass if selector == 'valid' else cc_test_datasets_multiclass\n",
|
134 |
+
" else:\n",
|
135 |
+
" raise Exception(\"Unknown suite\")\n",
|
136 |
+
" return ds"
|
137 |
+
]
|
138 |
+
},
|
139 |
+
{
|
140 |
+
"cell_type": "code",
|
141 |
+
"execution_count": null,
|
142 |
+
"metadata": {},
|
143 |
+
"outputs": [],
|
144 |
+
"source": [
|
145 |
+
"model_string, longer, task_type = '', 1, 'multiclass'\n",
|
146 |
+
"eval_positions = [1000]\n",
|
147 |
+
"bptt = 2000\n",
|
148 |
+
" \n",
|
149 |
+
"test_datasets, valid_datasets = get_datasets('test', task_type, suite='cc'), get_datasets('valid', task_type, suite='cc')"
|
150 |
+
]
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"cell_type": "markdown",
|
154 |
+
"metadata": {
|
155 |
+
"jp-MarkdownHeadingCollapsed": true,
|
156 |
+
"tags": []
|
157 |
+
},
|
158 |
+
"source": [
|
159 |
+
"### Select a dataset for prediction"
|
160 |
+
]
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"cell_type": "code",
|
164 |
+
"execution_count": null,
|
165 |
+
"metadata": {},
|
166 |
+
"outputs": [],
|
167 |
+
"source": [
|
168 |
+
"[(i, test_datasets[i][0]) for i in range(len(test_datasets))]"
|
169 |
+
]
|
170 |
+
},
|
171 |
+
{
|
172 |
+
"cell_type": "code",
|
173 |
+
"execution_count": null,
|
174 |
+
"metadata": {},
|
175 |
+
"outputs": [],
|
176 |
+
"source": [
|
177 |
+
"evaluation_dataset_index = 4 # Index of the dataset to predict\n",
|
178 |
+
"ds = test_datasets[evaluation_dataset_index]\n",
|
179 |
+
"print(f'Evaluation dataset name: {ds[0]} shape {ds[1].shape}')"
|
180 |
+
]
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"cell_type": "code",
|
184 |
+
"execution_count": null,
|
185 |
+
"metadata": {},
|
186 |
+
"outputs": [],
|
187 |
+
"source": [
|
188 |
+
"xs, ys = ds[1].clone(), ds[2].clone()\n",
|
189 |
+
"eval_position = xs.shape[0] // 2\n",
|
190 |
+
"train_xs, train_ys = xs[0:eval_position], ys[0:eval_position]\n",
|
191 |
+
"test_xs, test_ys = xs[eval_position:], ys[eval_position:]"
|
192 |
+
]
|
193 |
+
},
|
194 |
+
{
|
195 |
+
"cell_type": "markdown",
|
196 |
+
"metadata": {
|
197 |
+
"tags": []
|
198 |
+
},
|
199 |
+
"source": [
|
200 |
+
"### Predict using a Fitted and Tuned Model"
|
201 |
+
]
|
202 |
+
},
|
203 |
+
{
|
204 |
+
"cell_type": "code",
|
205 |
+
"execution_count": null,
|
206 |
+
"metadata": {},
|
207 |
+
"outputs": [],
|
208 |
+
"source": [
|
209 |
+
"classifier = TabPFNClassifier(device='cpu')\n",
|
210 |
+
"classifier.fit(train_xs, train_ys)\n",
|
211 |
+
"prediction_ = classifier.predict_proba(test_xs)"
|
212 |
+
]
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"cell_type": "code",
|
216 |
+
"execution_count": null,
|
217 |
+
"metadata": {},
|
218 |
+
"outputs": [],
|
219 |
+
"source": [
|
220 |
+
"roc, ce = tabular_metrics.auc_metric(test_ys, prediction_), tabular_metrics.cross_entropy(test_ys, prediction_)\n",
|
221 |
+
"'AUC', float(roc), 'Cross Entropy', float(ce)"
|
222 |
+
]
|
223 |
+
},
|
224 |
+
{
|
225 |
+
"cell_type": "code",
|
226 |
+
"execution_count": null,
|
227 |
+
"metadata": {},
|
228 |
+
"outputs": [],
|
229 |
+
"source": []
|
230 |
+
}
|
231 |
+
],
|
232 |
+
"metadata": {
|
233 |
+
"kernelspec": {
|
234 |
+
"display_name": "Python 3 (ipykernel)",
|
235 |
+
"language": "python",
|
236 |
+
"name": "python3"
|
237 |
+
},
|
238 |
+
"language_info": {
|
239 |
+
"codemirror_mode": {
|
240 |
+
"name": "ipython",
|
241 |
+
"version": 3
|
242 |
+
},
|
243 |
+
"file_extension": ".py",
|
244 |
+
"mimetype": "text/x-python",
|
245 |
+
"name": "python",
|
246 |
+
"nbconvert_exporter": "python",
|
247 |
+
"pygments_lexer": "ipython3",
|
248 |
+
"version": "3.7.13"
|
249 |
+
}
|
250 |
+
},
|
251 |
+
"nbformat": 4,
|
252 |
+
"nbformat_minor": 4
|
253 |
+
}
|
TabPFN/TabularEvaluationVisualization.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
TabPFN/TrainingTuningAndPrediction.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
TabPFN/__pycache__/encoders.cpython-39.pyc
ADDED
Binary file (9.42 kB). View file
|
|
TabPFN/__pycache__/layer.cpython-39.pyc
ADDED
Binary file (4.56 kB). View file
|
|
TabPFN/__pycache__/model_builder.cpython-39.pyc
ADDED
Binary file (9.88 kB). View file
|
|
TabPFN/__pycache__/notebook_utils.cpython-39.pyc
ADDED
Binary file (1.53 kB). View file
|
|
TabPFN/__pycache__/positional_encodings.cpython-39.pyc
ADDED
Binary file (2.91 kB). View file
|
|
TabPFN/__pycache__/train.cpython-39.pyc
ADDED
Binary file (12.2 kB). View file
|
|
TabPFN/__pycache__/transformer.cpython-39.pyc
ADDED
Binary file (8.01 kB). View file
|
|
TabPFN/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (10.2 kB). View file
|
|
TabPFN/datasets/__init__.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import torch
|
3 |
+
import numpy as np
|
4 |
+
import openml
|
5 |
+
|
6 |
+
|
7 |
+
def get_openml_classification(did, max_samples, multiclass=True, shuffled=True):
    """Fetch a single OpenML classification dataset as torch tensors.

    :param did: OpenML dataset id.
    :param max_samples: if truthy, truncate to the first ``max_samples`` rows.
    :param multiclass: if False, keep only rows with label < 2 (binarize).
    :param shuffled: if True, rows are shuffled with a fixed seed; if False
        (binary only), rows are rebalanced and interleaved by class instead.
    :return: ``(X, y, categorical_feature_indices, attribute_names)``, or
        ``(None, None, None, None)`` when the dataset cannot be loaded as arrays.
    """
    dataset = openml.datasets.get_dataset(did)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="array", target=dataset.default_target_attribute
    )

    if not multiclass:
        # Binarize: drop all rows whose label is not 0 or 1.
        X = X[y < 2]
        y = y[y < 2]

    if multiclass and not shuffled:
        raise NotImplementedError("This combination of multiclass and shuffling isn't implemented")

    if not isinstance(X, np.ndarray) or not isinstance(y, np.ndarray):
        # Sparse or otherwise non-array data cannot be handled downstream.
        print('Not a NP Array, skipping')
        return None, None, None, None

    if not shuffled:
        # Binary, unshuffled case: build a class-balanced, class-interleaved ordering.
        # Sort so the minority class occupies the tail, keep 2 * minority_count rows,
        # then reshape to (2, -1) and transpose so classes alternate row by row.
        sort = np.argsort(y) if y.mean() < 0.5 else np.argsort(-y)
        pos = int(y.sum()) if y.mean() < 0.5 else int((1 - y).sum())  # minority class size
        X, y = X[sort][-pos * 2:], y[sort][-pos * 2:]
        y = torch.tensor(y).reshape(2, -1).transpose(0, 1).reshape(-1).flip([0]).float()
        X = torch.tensor(X).reshape(2, -1, X.shape[1]).transpose(0, 1).reshape(-1, X.shape[1]).flip([0]).float()
    else:
        # Deterministic shuffle so repeated loads yield identical splits.
        order = np.arange(y.shape[0])
        np.random.seed(13)
        np.random.shuffle(order)
        X, y = torch.tensor(X[order]), torch.tensor(y[order])
    if max_samples:
        X, y = X[:max_samples], y[:max_samples]

    return X, y, list(np.where(categorical_indicator)[0]), attribute_names
|
39 |
+
|
40 |
+
def load_openml_list(dids, filter_for_nan=False
                     , num_feats=100
                     , min_samples = 100
                     , max_samples=400
                     , multiclass=True
                     , max_num_classes=10
                     , shuffled=True
                     , return_capped = False):
    """Load a list of OpenML classification datasets, filtering or capping them.

    :param dids: iterable of OpenML dataset ids.
    :param filter_for_nan: drop datasets that contain missing values.
    :param num_feats: maximum number of features; extra features are either
        dropped (``return_capped=True``) or the dataset is skipped.
    :param min_samples: skip datasets with fewer rows than this after loading.
    :param max_samples: rows per dataset are truncated to this many.
    :param multiclass: forwarded to :func:`get_openml_classification`.
    :param max_num_classes: maximum number of classes; surplus classes are
        either removed (``return_capped=True``) or the dataset is skipped.
    :param shuffled: forwarded to :func:`get_openml_classification`.
    :param return_capped: cap oversized datasets instead of skipping them.
    :return: ``(datasets, datalist)`` where ``datasets`` is a list of
        ``[name, X, y, categorical_feats, attribute_names, modifications]``
        entries and ``datalist`` is the OpenML metadata DataFrame.
    """
    datasets = []
    openml_list = openml.datasets.list_datasets(dids)
    print(f'Number of datasets: {len(openml_list)}')

    datalist = pd.DataFrame.from_dict(openml_list, orient="index")
    if filter_for_nan:
        datalist = datalist[datalist['NumberOfInstancesWithMissingValues'] == 0]
        print(f'Number of datasets after Nan and feature number filtering: {len(datalist)}')

    for ds in datalist.index:
        # Records which caps were applied so callers can tell modified datasets apart.
        modifications = {'samples_capped': False, 'classes_capped': False, 'feats_capped': False}
        entry = datalist.loc[ds]

        print('Loading', entry['name'], entry.did, '..')

        if entry['NumberOfClasses'] == 0.0:
            raise Exception("Regression not supported")
            #X, y, categorical_feats, attribute_names = get_openml_regression(int(entry.did), max_samples)
        else:
            X, y, categorical_feats, attribute_names = get_openml_classification(int(entry.did), max_samples
                                                                                 , multiclass=multiclass, shuffled=shuffled)
        if X is None:
            continue

        if X.shape[1] > num_feats:
            if return_capped:
                X = X[:, 0:num_feats]
                categorical_feats = [c for c in categorical_feats if c < num_feats]
                modifications['feats_capped'] = True
            else:
                print('Too many features')
                continue
        if X.shape[0] == max_samples:
            # Loader truncated the rows, so the dataset was (probably) larger.
            modifications['samples_capped'] = True

        if X.shape[0] < min_samples:
            print(f'Too few samples left')
            continue

        if len(np.unique(y)) > max_num_classes:
            if return_capped:
                # Keep only the first `max_num_classes` classes. This index was a
                # hard-coded 10 before, which raised an IndexError (or mis-capped)
                # whenever max_num_classes != 10.
                X = X[y < np.unique(y)[max_num_classes]]
                y = y[y < np.unique(y)[max_num_classes]]
                modifications['classes_capped'] = True
            else:
                print(f'Too many classes')
                continue

        datasets += [[entry['name'], X, y, categorical_feats, attribute_names, modifications]]

    return datasets, datalist
|
99 |
+
|
100 |
+
|
101 |
+
# Classification benchmark dataset ids (OpenML dataset ids, "dids").
# Small validation suite used during development.
valid_dids_classification = [13, 59, 4, 15, 40710, 43, 1498]
# Held-out test suite for final evaluation.
test_dids_classification = [973, 1596, 40981, 1468, 40984, 40975, 41163, 41147, 1111, 41164, 1169, 1486, 41143, 1461, 41167, 40668, 41146, 41169, 41027, 23517, 41165, 41161, 41159, 41138, 1590, 41166, 1464, 41168, 41150, 1489, 41142, 3, 12, 31, 54, 1067]
# Larger validation pool of classification datasets.
valid_large_classification = [ 943, 23512, 49, 838, 1131, 767, 1142, 748, 1112,
                               1541, 384, 912, 1503, 796, 20, 30, 903, 4541,
                               961, 805, 1000, 4135, 1442, 816, 1130, 906, 1511,
                               184, 181, 137, 1452, 1481, 949, 449, 50, 913,
                               1071, 831, 843, 9, 896, 1532, 311, 39, 451,
                               463, 382, 778, 474, 737, 1162, 1538, 820, 188,
                               452, 1156, 37, 957, 911, 1508, 1054, 745, 1220,
                               763, 900, 25, 387, 38, 757, 1507, 396, 4153,
                               806, 779, 746, 1037, 871, 717, 1480, 1010, 1016,
                               981, 1547, 1002, 1126, 1459, 846, 837, 1042, 273,
                               1524, 375, 1018, 1531, 1458, 6332, 1546, 1129, 679,
                               389]

# OpenML-CC18 test datasets.
# Filtered by N_samples < 2000, N feats < 100, N classes < 10.
open_cc_dids = [11,
                14,
                15,
                16,
                18,
                22,
                23,
                29,
                31,
                37,
                50,
                54,
                188,
                458,
                469,
                1049,
                1050,
                1063,
                1068,
                1510,
                1494,
                1480,
                1462,
                1464,
                6332,
                23381,
                40966,
                40982,
                40994,
                40975]

# Validation datasets disjoint from the OpenML-CC18 test suite above.
open_cc_valid_dids = [13,25,35,40,41,43,48,49,51,53,55,56,59,61,187,285,329,333,334,335,336,337,338,377,446,450,451,452,460,463,464,466,470,475,481,679,694,717,721,724,733,738,745,747,748,750,753,756,757,764,765,767,774,778,786,788,795,796,798,801,802,810,811,814,820,825,826,827,831,839,840,841,844,852,853,854,860,880,886,895,900,906,907,908,909,915,925,930,931,934,939,940,941,949,966,968,984,987,996,1048,1054,1071,1073,1100,1115,1412,1442,1443,1444,1446,1447,1448,1451,1453,1488,1490,1495,1498,1499,1506,1508,1511,1512,1520,1523,4153,23499,40496,40646,40663,40669,40680,40682,40686,40690,40693,40705,40706,40710,40711,40981,41430,41538,41919,41976,42172,42261,42544,42585,42638]
|
TabPFN/datasets/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (4.69 kB). View file
|
|
TabPFN/datasets/utils.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def normalize_data(eval_xs):
    """Standardize features to zero mean and (approximately) unit variance.

    Statistics are computed along axis 0 (the sample axis); a small epsilon
    is added to the standard deviation to avoid division by zero on
    constant features.
    """
    centered = eval_xs - eval_xs.mean(0)
    return centered / (eval_xs.std(0) + .000001)
|
7 |
+
|
8 |
+
|
TabPFN/decoders.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
import random
|
4 |
+
|
5 |
+
|
6 |
+
class ScaledDecoder(nn.Module):
    """Decoder head that divides its logits by a learned, input-dependent temperature.

    A softmax over ``linear2``'s output selects a convex combination of fixed
    candidate temperatures, which then rescales the ``linear1`` logits.
    """

    def __init__(self, ninp, nhid, nout):
        super().__init__()
        self.linear = nn.Linear(ninp, nhid)
        self.linear1 = nn.Linear(nhid, nout)
        self.linear2 = nn.Linear(nhid, 10)

    def forward(self, x):
        hidden = nn.GELU()(self.linear(x))
        # Candidate temperatures; the softmax weights blend them per position.
        candidates = torch.tensor([1., 1.4, 1.7, 2., 5., 10., 20., 40., 80., 160.], device=x.device)
        temps = self.linear2(hidden).softmax(-1) @ candidates
        # Occasional debug logging of the chosen temperatures (~1% of calls).
        if random.random() > .99:
            print(temps.shape, temps[:, :2])
        return self.linear1(hidden) / temps.unsqueeze(-1)
|
21 |
+
|
22 |
+
class FixedScaledDecoder(nn.Module):
    """Decoder head with a single global learned temperature.

    The temperature is the sum of the parameter vector ``T`` (initialized so
    the sum is 1, i.e. initially a no-op scaling).
    """

    def __init__(self, ninp, nhid, nout):
        super().__init__()
        self.mapper = nn.Sequential(nn.Linear(ninp, nhid), nn.GELU(), nn.Linear(nhid, nout))
        self.T = nn.Parameter(torch.ones(10000)/10000)

    def forward(self, x):
        logits = self.mapper(x)
        return logits / self.T.sum()
|
30 |
+
|
TabPFN/differentiable_pfn_evaluation.py
ADDED
@@ -0,0 +1,345 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
import numpy as np
|
4 |
+
import time
|
5 |
+
import pickle
|
6 |
+
from scripts import tabular_metrics
|
7 |
+
from scripts.tabular_metrics import calculate_score_per_method
|
8 |
+
from scripts.tabular_evaluation import evaluate
|
9 |
+
from priors.differentiable_prior import draw_random_style
|
10 |
+
from tqdm import tqdm
|
11 |
+
import random
|
12 |
+
from scripts.transformer_prediction_interface import get_params_from_config, load_model_workflow
|
13 |
+
|
14 |
+
"""
|
15 |
+
===============================
|
16 |
+
PUBLIC FUNCTIONS FOR EVALUATION
|
17 |
+
===============================
|
18 |
+
"""
|
19 |
+
|
20 |
+
|
21 |
+
def eval_model_range(i_range, *args, **kwargs):
    """Run :func:`eval_model` for every index in ``i_range``, forwarding all other arguments."""
    for model_index in i_range:
        eval_model(model_index, *args, **kwargs)
|
24 |
+
|
25 |
+
|
26 |
+
|
27 |
+
def eval_model(i, e, valid_datasets, test_datasets, train_datasets, eval_positions_valid, eval_positions_test,
               bptt_valid,
               bptt_test, add_name, base_path, device='cpu', eval_addition='', **extra_tuning_args):
    """
    Differentiable model evaluation workflow. Evaluates and saves results to disk.

    :param i: model index, forwarded to load_model_workflow.
    :param e: epoch/checkpoint selector, forwarded to load_model_workflow.
    :param valid_datasets: datasets used for style/temperature selection.
    :param test_datasets: datasets used for the final evaluation.
    :param train_datasets: training datasets, forwarded to the evaluator.
    :param eval_positions_valid: evaluation split positions for validation.
    :param eval_positions_test: evaluation split positions for testing.
    :param bptt_valid: sequence length used during validation.
    :param bptt_test: sequence length used during the final test.
    :param add_name: name suffix identifying the checkpoint to load.
    :param base_path: base directory for checkpoints and results.
    :param device: torch device string, e.g. 'cpu' or 'cuda'.
    :param eval_addition: suffix appended to the results file name.
    :param extra_tuning_args: forwarded to evaluate_differentiable_model.
    :return: tuple ``(r, model)`` where ``r`` is the pickled results list
        ``[config, metrics, metrics_valid, style, temperature, optimization_route]``.
    """
    model, c, results_file = load_model_workflow(i, e, add_name, base_path, device, eval_addition)
    params = {'bptt': bptt_valid
        , 'bptt_final': bptt_test
        , 'eval_positions': eval_positions_valid
        , 'eval_positions_test': eval_positions_test
        , 'valid_datasets': valid_datasets
        , 'test_datasets': test_datasets
        , 'train_datasets': train_datasets
        , 'verbose': True
        , 'device': device
              }

    # Merge evaluation-relevant settings from the loaded model config.
    params.update(get_params_from_config(c))

    start = time.time()
    metrics, metrics_valid, style, temperature, optimization_route = evaluate_differentiable_model(model, **params,
                                                                                                  **extra_tuning_args)
    print('Evaluation time: ', time.time() - start)

    print(results_file)
    # Move tensors to CPU so the pickle is loadable without a GPU.
    r = [c.copy(), metrics, metrics_valid, style.to('cpu'), temperature.to('cpu'), optimization_route]
    with open(results_file, 'wb') as output:
        # These config entries hold callables/samplers that cannot be pickled.
        del r[0]['num_features_used']
        del r[0]['categorical_features_sampler']
        pickle.dump(r, output)

    _, _, _, style, temperature, _ = r

    return r, model
|
78 |
+
|
79 |
+
"""
|
80 |
+
===============================
|
81 |
+
INTERNAL HELPER FUNCTIONS
|
82 |
+
===============================
|
83 |
+
"""
|
84 |
+
|
85 |
+
def evaluate_differentiable_model(model
                                  , valid_datasets
                                  , test_datasets
                                  , train_datasets
                                  , N_draws=100
                                  , N_grad_steps=10
                                  , eval_positions=None
                                  , eval_positions_test=None
                                  , bptt=100
                                  , bptt_final=200
                                  , style=None
                                  , n_parallel_configurations=1
                                  , device='cpu'
                                  , selection_metric='auc'
                                  , final_splits=[1, 2, 3, 4, 5]
                                  , N_ensemble_configurations_list=[1, 5, 10, 20, 50, 100]
                                  , **kwargs):
    """
    Evaluation function for differentiable model evaluation. Draws random hyperparameter
    ('style') initializations, optionally refines each with gradient steps on the train
    datasets, selects the best configuration on the valid datasets and finally evaluates
    that configuration on the test datasets.

    :param model: tuple where model[2] is the torch module and model[3] holds the style definitions
    :param valid_datasets: datasets used for configuration selection
    :param test_datasets: datasets used for the final evaluation
    :param train_datasets: datasets used for the gradient-based style optimization
    :param N_draws: number of random style initializations to try
    :param N_grad_steps: gradient steps per draw (0 disables gradient tuning)
    :param eval_positions: eval positions used during selection
    :param eval_positions_test: eval positions used for the final evaluation
    :param bptt: sequence length during selection
    :param bptt_final: sequence length during the final evaluation
    :param style: initial style (unused; overwritten by random draws)
    :param n_parallel_configurations: number of styles evaluated in parallel per draw
    :param device: torch device string
    :param selection_metric: 'auc'/'roc' (maximized) or 'ce'/'selection_metric' (minimized)
    :param final_splits: dataset split numbers used for the final evaluation
    :param N_ensemble_configurations_list: ensemble sizes evaluated in the final evaluation
    :param kwargs: forwarded to `gradient_optimize_style` / `eval_step`
    :return: (result_test, result_valid, best_style, best_softmax_temperature, optimization_routes)
    """
    # fixed seeds: the whole evaluation is meant to be reproducible
    torch.manual_seed(0)
    np.random.seed(0)
    random.seed(0)

    diffable_metric = tabular_metrics.cross_entropy
    evaluation_metric = tabular_metrics.auc_metric
    if selection_metric in ('auc', 'roc'):
        selection_metric_min_max = 'max'
        selection_metric = tabular_metrics.auc_metric
        evaluation_metric = selection_metric
    elif selection_metric in ('ce', 'selection_metric'):
        selection_metric_min_max = 'min'
        selection_metric = tabular_metrics.cross_entropy
        evaluation_metric = selection_metric

    print('Diffable metric', diffable_metric, ' Selection metric', selection_metric, ' Evaluation metric',
          evaluation_metric)
    print('N PARALLEL CONFIGURATIONS', n_parallel_configurations)
    print('eval_positions', eval_positions)

    def evaluate_valid(style, softmax_temperature, results, results_tracked):
        # Scores one configuration on the valid datasets and appends the per-position
        # selection scores plus their nan-mean to the running lists (in place).
        result_valid = eval_step(valid_datasets, style, softmax_temperature=softmax_temperature,
                                 return_tensor=False, inference_mode=True, selection_metric=selection_metric,
                                 evaluation_metric=evaluation_metric, eval_positions=eval_positions, bptt=bptt, model=model[2])
        result_valid = [float(result_valid[f'mean_select_at_{pos}']) for pos in eval_positions]
        results += [result_valid]
        results_tracked += [np.nanmean(result_valid)]

    model[2].to(device)
    model[2].eval()

    results_on_valid, results_on_valid_tracked = [], []
    optimization_routes = []

    # start from one random configuration so `best_*` is never undefined
    best_style = torch.cat([draw_random_style(model[3], device).detach() for n in range(0, n_parallel_configurations)],
                           0)
    best_softmax_temperature = torch.cat([torch.tensor([0.0]).to(device) for n in range(0, n_parallel_configurations)],
                                         0)

    for _ in tqdm(range(0, N_draws), desc='Iterate over Optimization initializations'):  # Evaluates N hparam draws
        style = torch.cat([draw_random_style(model[3], device).detach() for n in range(0, n_parallel_configurations)],
                          0)
        softmax_temperature = torch.cat([torch.tensor([0.0]).to(device) for n in range(0, n_parallel_configurations)],
                                        0)

        evaluate_valid(style, softmax_temperature, results_on_valid, results_on_valid_tracked)

        print(f'Draw --> Valid Selection metric: {results_on_valid[-1]}')

        if N_grad_steps > 0:
            # BUGFIX: the original call additionally passed `model=model[2]`, which collides
            # with the positional `model` argument of `gradient_optimize_style` and raises
            # TypeError; that function reaches the module via model[2] itself.
            gradient_optimize_result = gradient_optimize_style(model, style, N_grad_steps
                                                               , softmax_temperature=softmax_temperature
                                                               , train_datasets=train_datasets
                                                               , valid_datasets=valid_datasets
                                                               , selection_metric_min_max=selection_metric_min_max
                                                               , **kwargs)
            optimization_routes += [gradient_optimize_result['optimization_route']]

            evaluate_valid(gradient_optimize_result['best_style']
                           , gradient_optimize_result['best_temperature']
                           , results_on_valid, results_on_valid_tracked)

            print(f'After diff --> Valid Selection metric: {results_on_valid[-1]}')

            candidate_style = gradient_optimize_result['best_style']
            candidate_temperature = gradient_optimize_result['best_temperature']
        else:
            # BUGFIX: with N_grad_steps == 0 there is no optimization result; fall back to
            # the drawn configuration (the original raised a NameError in this case).
            candidate_style, candidate_temperature = style, softmax_temperature

        if selection_metric_min_max == 'min':
            is_best = (results_on_valid_tracked[-1] <= min(results_on_valid_tracked))
        else:
            is_best = (results_on_valid_tracked[-1] >= max(results_on_valid_tracked))

        if is_best or best_style is None:
            best_style = candidate_style.clone()
            best_softmax_temperature = candidate_temperature.clone()
        torch.cuda.empty_cache()

    def final_evaluation():
        # Re-evaluates the selected configuration with final sequence length and eval
        # positions: several ensemble sizes x several splits on test, once on valid.
        print('Running eval dataset with final params (no gradients)..')
        print(best_style, best_softmax_temperature)
        result_test = []
        for N_ensemble_configurations in N_ensemble_configurations_list:
            print(f'Running with {N_ensemble_configurations} ensemble_configurations')
            kwargs['N_ensemble_configurations'] = N_ensemble_configurations
            splits = []
            for split in final_splits:
                splits += [eval_step(test_datasets, best_style, softmax_temperature=best_softmax_temperature
                                     , return_tensor=False, eval_positions=eval_positions_test,
                                     bptt=bptt_final, inference_mode=True, split_number=split, model=model[2]
                                     , selection_metric=selection_metric, evaluation_metric=evaluation_metric)]
            result_test += [splits]

        print('Running valid dataset with final params (no gradients)..')
        result_valid = eval_step(valid_datasets, best_style, softmax_temperature=best_softmax_temperature
                                 , return_tensor=False, eval_positions=eval_positions_test,
                                 bptt=bptt_final, inference_mode=True, model=model[2]
                                 , selection_metric=selection_metric, evaluation_metric=evaluation_metric)

        return result_test, result_valid

    result_test, result_valid = final_evaluation()

    return result_test, result_valid, best_style, best_softmax_temperature, optimization_routes
def eval_step(ds, used_style, selection_metric, evaluation_metric, eval_positions, return_tensor=True, **kwargs):
    """
    Run the tabular `evaluate` workflow for one set of datasets and attach aggregated
    scores for both the selection and the evaluation metric.

    :param ds: datasets to evaluate
    :param used_style: style (hyperparameter) tensor passed to the transformer
    :param selection_metric: metric aggregated under the 'select' key
    :param evaluation_metric: metric aggregated under the 'eval' key
    :param eval_positions: eval positions to score at
    :param return_tensor: keep gradients when True; otherwise run under torch.no_grad()
    :param kwargs: forwarded to `evaluate`
    :return: result dict of `evaluate`, extended with per-position aggregated scores
    """
    def run_evaluation():
        return evaluate(datasets=ds,
                        method='transformer'
                        , overwrite=True
                        , style=used_style
                        , eval_positions=eval_positions
                        , metric_used=selection_metric
                        , save=False
                        , path_interfix=None
                        , base_path=None
                        , verbose=True
                        , **kwargs)

    if return_tensor:
        result = run_evaluation()
    else:
        with torch.no_grad():
            result = run_evaluation()

    # aggregate both metrics across datasets (mean) per eval position, in place
    calculate_score_per_method(selection_metric, 'select', result, ds, eval_positions, aggregator='mean')
    calculate_score_per_method(evaluation_metric, 'eval', result, ds, eval_positions, aggregator='mean')

    return result
|
256 |
+
def gradient_optimize_style(model, init_style, steps, softmax_temperature, train_datasets, valid_datasets, learning_rate=0.03, optimize_all=False,
                            limit_style=True, N_datasets_sampled=90, optimize_softmax_temperature=True, selection_metric_min_max='max', **kwargs):
    """
    Uses gradient based methods to optimize 'style' on the 'train_datasets' and uses stopping with 'valid_datasets'.

    :param model: tuple where model[2] is the torch module
    :param init_style: initial style tensor to optimize
    :param steps: number of optimization steps
    :param softmax_temperature: initial softmax temperature tensor
    :param train_datasets: datasets the gradient is computed on
    :param valid_datasets: datasets used for best-configuration tracking (early stopping)
    :param learning_rate: Adam learning rate
    :param optimize_all: optimize the model parameters instead of (style, temperature)
    :param limit_style: clamp the style into [-1.74, 1.74] after each step
    :param N_datasets_sampled: number of train datasets sampled per step
    :param optimize_softmax_temperature: whether the temperature receives gradients
    :param selection_metric_min_max: 'min' or 'max' — direction of the selection metric
    :param kwargs: forwarded to `eval_step`
    :return: dict with 'best_style', 'best_temperature' and the 'optimization_route'
    """
    grad_style = torch.nn.Parameter(init_style.detach(), requires_grad=True)

    best_style, best_temperature, best_selection_metric, best_diffable_metric = grad_style.detach(), softmax_temperature.detach(), None, None
    softmax_temperature = torch.nn.Parameter(softmax_temperature.detach(), requires_grad=optimize_softmax_temperature)
    variables_to_optimize = model[2].parameters() if optimize_all else [grad_style, softmax_temperature]
    optimizer = torch.optim.Adam(variables_to_optimize, lr=learning_rate)

    optimization_route_selection, optimization_route_diffable = [], []
    optimization_route_selection_valid, optimization_route_diffable_valid = [], []

    def eval_opt(ds, return_tensor=True, inference_mode=False):
        # Evaluate the current (style, temperature) on `ds`; returns the differentiable
        # loss ('mean_metric') and the non-differentiable selection score ('mean_select').
        result = eval_step(ds, grad_style, softmax_temperature=softmax_temperature, return_tensor=return_tensor
                           , inference_mode=inference_mode, model=model[2], **kwargs)

        diffable_metric = result['mean_metric']
        selection_metric = result['mean_select']

        return diffable_metric, selection_metric

    def eval_all_datasets(datasets, propagate=True):
        # Accumulates gradients dataset-by-dataset (when `propagate`) and returns the
        # nan-mean of both metrics across the datasets.
        selection_metrics_this_step, diffable_metrics_this_step = [], []
        for ds in datasets:
            diffable_metric_train, selection_metric_train = eval_opt([ds], inference_mode=(not propagate))
            if not torch.isnan(diffable_metric_train).any():
                if propagate and diffable_metric_train.requires_grad == True:
                    diffable_metric_train.backward()
                selection_metrics_this_step += [selection_metric_train]
                diffable_metrics_this_step += [float(diffable_metric_train.detach().cpu().numpy())]
        diffable_metric_train = np.nanmean(diffable_metrics_this_step)
        selection_metric_train = np.nanmean(selection_metrics_this_step)

        return diffable_metric_train, selection_metric_train

    for t in tqdm(range(steps), desc='Iterate over Optimization steps'):
        optimizer.zero_grad()

        # Select subset of datasets (deterministic per step)
        random.seed(t)
        train_datasets_ = random.sample(train_datasets, N_datasets_sampled)

        # Get score on train
        diffable_metric_train, selection_metric_train = eval_all_datasets(train_datasets_, propagate=True)
        optimization_route_selection += [float(selection_metric_train)]
        optimization_route_diffable += [float(diffable_metric_train)]

        # Get score on valid
        diffable_metric_valid, selection_metric_valid = eval_all_datasets(valid_datasets, propagate=False)
        optimization_route_selection_valid += [float(selection_metric_valid)]
        optimization_route_diffable_valid += [float(diffable_metric_valid)]

        # BUGFIX: on the first step `best_selection_metric` is None and the original
        # compared it with </> directly, raising TypeError; treat None as "always best".
        if best_selection_metric is None:
            is_best = True
        else:
            is_best = (selection_metric_min_max == 'min' and best_selection_metric > selection_metric_valid) \
                      or (selection_metric_min_max == 'max' and best_selection_metric < selection_metric_valid)
        if (best_selection_metric is None) or (not np.isnan(selection_metric_valid) and is_best):
            print('New best', best_selection_metric, selection_metric_valid)
            best_style = grad_style.detach().clone()
            best_temperature = softmax_temperature.detach().clone()
            best_selection_metric, best_diffable_metric = selection_metric_valid, diffable_metric_valid

        optimizer.step()

        if limit_style:
            # NOTE(review): this rebinds `grad_style` to a detached, clamped tensor — the
            # optimizer keeps updating the original parameter while later evaluations use
            # this detached copy; confirm this is intended.
            grad_style = grad_style.detach().clamp(-1.74, 1.74)

        print(f'Valid: Diffable metric={diffable_metric_valid} Selection metric={selection_metric_valid};' +
              f'Train: Diffable metric={diffable_metric_train} Selection metric={selection_metric_train}')

    print(f'Return best:{best_style} {best_selection_metric}')
    return {'best_style': best_style, 'best_temperature': best_temperature
            , 'optimization_route': {'select': optimization_route_selection, 'loss': optimization_route_diffable,
                                     'test_select': optimization_route_selection_valid, 'test_loss': optimization_route_diffable_valid}}
|
TabPFN/encoders.py
ADDED
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
from utils import normalize_data
|
6 |
+
import torch.nn.functional as F
|
7 |
+
from torch.nn import TransformerEncoder, TransformerEncoderLayer
|
8 |
+
|
9 |
+
|
10 |
+
class StyleEncoder(nn.Module):
    """Embeds a flat hyperparameter ('style') vector into the transformer embedding space."""

    def __init__(self, em_size, hyperparameter_definitions):
        """
        :param em_size: output embedding dimension
        :param hyperparameter_definitions: 1-d array-like whose length fixes the number of hyperparameters
        """
        super().__init__()
        self.em_size = em_size
        # a single linear layer maps all hyperparameters jointly to the embedding
        self.embedding = nn.Linear(hyperparameter_definitions.shape[0], self.em_size)

    def forward(self, hyperparameters):  # T x B x num_features
        return self.embedding(hyperparameters)
30 |
+
|
31 |
+
class _PositionalEncoding(nn.Module):
    # Sinusoidal encoding of feature *values* (not sequence positions): each scalar
    # feature is expanded into d_model // num_features sin/cos channels.
    def __init__(self, d_model, dropout=0.):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model
        # dummy parameter used only to discover the module's device at forward time
        self.device_test_tensor = nn.Parameter(torch.tensor(1.))

    def forward(self, x):# T x B x num_features
        # NOTE(review): due to precedence this parses as (self.d_model % x.shape[-1]) * 2 == 0,
        # i.e. it only checks divisibility by num_features; likely intended
        # `self.d_model % (2 * x.shape[-1]) == 0` so sin/cos pairs fit — confirm.
        assert self.d_model % x.shape[-1]*2 == 0
        d_per_feature = self.d_model // x.shape[-1]
        pe = torch.zeros(*x.shape, d_per_feature, device=self.device_test_tensor.device)
        #position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        interval_size = 10
        # frequencies grow geometrically (factor sqrt(2) per channel pair)
        div_term = (1./interval_size) * 2*math.pi*torch.exp(torch.arange(0, d_per_feature, 2, device=self.device_test_tensor.device).float()*math.log(math.sqrt(2)))
        #print(div_term/2/math.pi)
        pe[..., 0::2] = torch.sin(x.unsqueeze(-1) * div_term)
        pe[..., 1::2] = torch.cos(x.unsqueeze(-1) * div_term)
        return self.dropout(pe).view(x.shape[0],x.shape[1],self.d_model)


# Factory matching the (num_features, emsize) encoder-creator signature; num_features is ignored.
Positional = lambda _, emsize: _PositionalEncoding(d_model=emsize)
|
53 |
+
class EmbeddingEncoder(nn.Module):
    """Encodes features by discretizing each value into `num_embs` uniform bins over
    [-2, 2] (clamped) and averaging the corresponding learned embeddings across features."""

    def __init__(self, num_features, em_size, num_embs=100):
        super().__init__()
        self.num_embs = num_embs
        # one table slice per feature, laid out contiguously: feature f owns rows
        # [f * num_embs, (f + 1) * num_embs)
        self.embeddings = nn.Embedding(num_embs * num_features, em_size, max_norm=True)
        self.init_weights(.1)
        self.min_max = (-2, +2)  # value range mapped onto the bins; values outside are clamped

    @property
    def width(self):
        # total width of the discretized value range
        return self.min_max[1] - self.min_max[0]

    def init_weights(self, initrange):
        self.embeddings.weight.data.uniform_(-initrange, initrange)

    def discretize(self, x):
        split_size = self.width / self.num_embs
        # BUGFIX: the original `(x - self.min_max[0] // split_size)` divided only the
        # lower bound (`//` binds tighter than `-`), collapsing every input into a
        # handful of bins. Intended: uniform bins over [min, max].
        return ((x - self.min_max[0]) / split_size).int().clamp(0, self.num_embs - 1)

    def forward(self, x):  # T x B x num_features
        x_idxs = self.discretize(x)
        # offset each feature into its own slice of the shared embedding table
        x_idxs += torch.arange(x.shape[-1], device=x.device).view(1, 1, -1) * self.num_embs
        # average the per-feature embeddings into one vector per (T, B) position
        return self.embeddings(x_idxs).mean(-2)
78 |
+
|
79 |
+
class Normalize(nn.Module):
    """Affine input standardization: (x - mean) / std."""

    def __init__(self, mean, std):
        super().__init__()
        self.mean = mean
        self.std = std

    def forward(self, x):
        centered = x - self.mean
        return centered / self.std
+
|
89 |
+
def get_normalized_uniform_encoder(encoder_creator):
    """
    Wrap an encoder creator so that inputs drawn uniformly from [0, 1] are standardized
    to zero mean and unit std before reaching the wrapped encoder.

    For example, `encoder_creator = get_normalized_uniform_encoder(encoders.Linear)`;
    the result is then initialized with `encoder_creator(feature_dim, in_dim)`.

    :param encoder_creator: callable (in_dim, out_dim) -> nn.Module
    :return: callable (in_dim, out_dim) -> nn.Module prepending the normalization
    """
    def create(in_dim, out_dim):
        # U(0, 1) has mean 1/2 and variance 1/12
        return nn.Sequential(Normalize(.5, math.sqrt(1 / 12)), encoder_creator(in_dim, out_dim))
    return create
99 |
+
|
100 |
+
# Plain linear encoder alias (shadowed by the NaN-handling `Linear` subclass defined below).
Linear = nn.Linear
# Two-layer MLP encoder creator.
# NOTE(review): it consumes num_features+1 inputs (one extra column), unlike the other
# encoder creators in this file — confirm the caller appends an extra feature.
MLP = lambda num_features, emsize: nn.Sequential(nn.Linear(num_features+1,emsize*2),
                                                 nn.ReLU(),
                                                 nn.Linear(emsize*2,emsize))
105 |
+
class NanHandlingEncoder(nn.Module):
    # Linear encoder that replaces NaNs with 0 before projecting; with `keep_nans` it
    # additionally appends per-entry missingness/infinity indicator features.
    def __init__(self, num_features, emsize, keep_nans=True):
        super().__init__()
        # with indicator features the linear layer sees twice as many inputs
        self.num_features = 2 * num_features if keep_nans else num_features
        self.emsize = emsize
        self.keep_nans = keep_nans
        self.layer = nn.Linear(self.num_features, self.emsize)

    def forward(self, x):
        if self.keep_nans:
            # Indicator channel per entry: NaN -> -1, +inf -> 1, -inf -> 2, finite -> 0,
            # then normalized by `normalize_data` (project util) before concatenation.
            # NOTE(review): the exact scaling depends on `normalize_data` — confirm in utils.
            x = torch.cat([torch.nan_to_num(x, nan=0.0), normalize_data(torch.isnan(x) * -1
                                                          + torch.logical_and(torch.isinf(x), torch.sign(x) == 1) * 1
                                                          + torch.logical_and(torch.isinf(x), torch.sign(x) == -1) * 2
                                                          )], -1)
        else:
            x = torch.nan_to_num(x, nan=0.0)
        return self.layer(x)
|
123 |
+
class Linear(nn.Linear):
    """nn.Linear encoder that treats NaN inputs as 0.0 before the projection."""

    def __init__(self, num_features, emsize):
        super().__init__(num_features, emsize)
        self.num_features = num_features
        self.emsize = emsize

    def forward(self, x):
        cleaned = torch.nan_to_num(x, nan=0.0)
        return super().forward(cleaned)
|
133 |
+
class SequenceSpanningEncoder(nn.Module):
    # Design sketch only — the constructor raises NotImplementedError immediately, so the
    # statements after the raise are unreachable.

    # Regular Encoder transforms Seq_len, B, S -> Seq_len, B, E attending only to last dimension
    # This Encoder accesses the Seq_Len dimension additionally

    # Why would we want this? We can learn normalization and embedding of features
    # , this might be more important for e.g. categorical, ordinal feats, nan detection
    # However maybe this can be easily learned through transformer as well?
    # A problem is to make this work across any sequence length and be independent of ordering

    # We could use average and maximum pooling and use those with a linear layer

    # Another idea !! Similar to this we would like to encode features so that their number is variable
    # We would like to embed features, also using knowledge of the features in the entire sequence

    # We could use convolution or another transformer
    # Convolution:

    # Transformer/Conv across sequence dimension that encodes and normalizes features
    # -> Transformer across feature dimension that encodes features to a constant size

    # Conv with flexible features but no sequence info: S,B,F -(reshape)-> S*B,1,F
    # -(Conv1d)-> S*B,N,F -(AvgPool,MaxPool)-> S*B,N,1 -> S,B,N
    # This probably won't work since it's missing a way to recognize which feature is encoded

    # Transformer with flexible features: S,B,F -> F,B*S,1 -> F2,B*S,1 -> S,B,F2

    def __init__(self, num_features, em_size):
        super().__init__()

        raise NotImplementedError()
        # Seq_len, B, S -> Seq_len, B, E
        # unreachable: left as a reminder of the intended Conv1d stack
        self.convs = torch.nn.ModuleList([nn.Conv1d(64 if i else 1, 64, 3) for i in range(5)])
        # self.linear = nn.Linear(64, emsize)
|
169 |
+
class TransformerBasedFeatureEncoder(nn.Module):
    # Experimental: encode a variable number of features with a transformer.
    # NOTE(review): this class looks unfinished/broken and is presumably unused:
    #  - `model` is a local variable, never assigned to `self.model`, which `forward` reads
    #  - nn.Transformer requires d_model % nhead == 0, but d_model=1 is used with nhead=4
    #  - `forward` collects `*input` into a tuple, so `input.transpose()` cannot work
    def __init__(self, num_features, emsize):
        super().__init__()

        hidden_emsize = emsize
        encoder = Linear(1, hidden_emsize)
        n_out = emsize
        nhid = 2*emsize
        dropout =0.0
        nhead=4
        nlayers=4
        model = nn.Transformer(nhead=nhead, num_encoder_layers=4, num_decoder_layers=4, d_model=1)

    def forward(self, *input):
        # intended shape flow: S,B,F -> F,S*B,1 -> F2,S*B,1 -> S,B,F2
        input = input.transpose()
        self.model(input)
|
187 |
+
class Conv(nn.Module):
    """Encodes each flat feature vector as a square image via a small CNN stack.

    The input's last dimension must be a perfect square; it is reshaped into a
    1-channel (side x side) image, passed through up to five 3x3 convolutions
    (stopping early once the spatial extent gets too small), spatially averaged,
    and projected to `emsize`.
    """

    def __init__(self, input_size, emsize):
        super().__init__()
        # first conv maps the single input channel to 64; the rest are 64 -> 64
        in_channels = [1, 64, 64, 64, 64]
        self.convs = torch.nn.ModuleList([nn.Conv2d(c, 64, 3) for c in in_channels])
        self.linear = nn.Linear(64, emsize)

    def forward(self, x):
        side = math.isqrt(x.shape[-1])
        assert side * side == x.shape[-1]
        x = x.reshape(*x.shape[:-1], 1, side, side)
        for layer in self.convs:
            if x.shape[-1] < 4:
                break  # too small for another valid 3x3 convolution
            x = layer(x).relu_()
        # global average pooling over the spatial dims (equivalent to AdaptiveAvgPool2d((1,1)))
        pooled = x.mean(dim=(-2, -1))
        return self.linear(pooled)
+
|
208 |
+
|
209 |
+
class CanEmb(nn.Embedding):
    """Canonical per-feature embedding.

    Splits `embedding_dim` evenly across `num_features`, embeds each (integer-valued)
    feature separately, and concatenates the per-feature embeddings back into a single
    vector of size `embedding_dim`.
    """

    def __init__(self, num_features, num_embeddings: int, embedding_dim: int, *args, **kwargs):
        assert embedding_dim % num_features == 0
        per_feature_dim = embedding_dim // num_features
        super().__init__(num_embeddings, per_feature_dim, *args, **kwargs)

    def forward(self, x):
        idx = x.long()
        assert (idx == x).all(), "CanEmb only works with tensors of whole numbers"
        emb = super().forward(idx)
        # flatten the per-feature embeddings: (..., F, E/F) -> (..., E)
        return emb.view(*emb.shape[:-2], -1)
|
221 |
+
def get_Canonical(num_classes):
    """Return a (num_features, emsize) encoder creator building a CanEmb with a fixed
    vocabulary of `num_classes`."""
    def creator(num_features, emsize):
        return CanEmb(num_features, num_classes, emsize)
    return creator
|
224 |
+
def get_Embedding(num_embs_per_feature=100):
    """Return a (num_features, emsize) encoder creator building an EmbeddingEncoder with
    a fixed number of bins per feature."""
    def creator(num_features, emsize):
        return EmbeddingEncoder(num_features, emsize, num_embs=num_embs_per_feature)
    return creator
TabPFN/initializers.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
|
4 |
+
def get_NormalInitializer(std):
    """Return an `nn.Module.apply`-compatible initializer that re-initializes the weight
    and bias of every nn.Linear it visits with N(0, std) samples."""
    def initializer(module):
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, 0, std)
            nn.init.normal_(module.bias, 0, std)
    return initializer
|
TabPFN/layer.py
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from functools import partial
|
2 |
+
|
3 |
+
from torch import nn
|
4 |
+
from torch.nn.modules.transformer import *
|
5 |
+
from torch.nn.modules.transformer import _get_activation_fn
|
6 |
+
|
7 |
+
from torch.utils.checkpoint import checkpoint
|
8 |
+
|
9 |
+
|
10 |
+
class TransformerEncoderLayer(Module):
    r"""TransformerEncoderLayer is made up of self-attn and feedforward network.
    This standard encoder layer is based on the paper "Attention Is All You Need".
    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
    Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
    Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
    in a different way during application.

    This variant extends the stock layer with optional pre-norm, attention gradient
    checkpointing (`recompute_attn`), and a special tuple-valued `src_mask` that routes
    attention separately for global / train / eval token groups.

    Args:
        d_model: the number of expected features in the input (required).
        nhead: the number of heads in the multiheadattention models (required).
        dim_feedforward: the dimension of the feedforward network model (default=2048).
        dropout: the dropout value (default=0.1).
        activation: the activation function of intermediate layer, relu or gelu (default=relu).
        layer_norm_eps: the eps value in layer normalization components (default=1e-5).
        batch_first: If ``True``, then the input and output tensors are provided
            as (batch, seq, feature). Default: ``False``.
        pre_norm: apply LayerNorm before each sublayer instead of after. Default: ``False``.
        recompute_attn: recompute attention in the backward pass (gradient checkpointing)
            to save memory. Default: ``False``.

    Examples::
        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
        >>> src = torch.rand(10, 32, 512)
        >>> out = encoder_layer(src)

    Alternatively, when ``batch_first`` is ``True``:
        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
        >>> src = torch.rand(32, 10, 512)
        >>> out = encoder_layer(src)
    """
    __constants__ = ['batch_first']

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu",
                 layer_norm_eps=1e-5, batch_first=False, pre_norm=False,
                 device=None, dtype=None, recompute_attn=False) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
                                            **factory_kwargs)
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward, **factory_kwargs)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model, **factory_kwargs)

        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.pre_norm = pre_norm  # True: norm before sublayer; False: norm after (post-norm)
        self.recompute_attn = recompute_attn  # trade compute for memory via checkpointing

        self.activation = _get_activation_fn(activation)

    def __setstate__(self, state):
        # Checkpoints saved without 'activation' default to ReLU, matching the stock layer.
        if 'activation' not in state:
            state['activation'] = F.relu
        super().__setstate__(state)

    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        r"""Pass the input through the encoder layer.

        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional). May also be a 3-tuple
                (global_src_mask, trainset_src_mask, valset_src_mask) to enable the
                grouped global-attention routing below.
            src_key_padding_mask: the mask for the src keys per batch (optional);
                must be None when a tuple mask is used.

        Shape:
            see the docs in Transformer class.
        """
        if self.pre_norm:
            src_ = self.norm1(src)
        else:
            src_ = src
        if isinstance(src_mask, tuple):
            # global attention setup: the sequence is partitioned into
            # [global tokens | train tokens | eval tokens], each group attending to a
            # different key set.
            assert not self.self_attn.batch_first
            assert src_key_padding_mask is None

            global_src_mask, trainset_src_mask, valset_src_mask = src_mask

            # the mask shapes encode how many tokens belong to each group
            num_global_tokens = global_src_mask.shape[0]
            num_train_tokens = trainset_src_mask.shape[0]

            global_tokens_src = src_[:num_global_tokens]
            train_tokens_src = src_[num_global_tokens:num_global_tokens+num_train_tokens]
            global_and_train_tokens_src = src_[:num_global_tokens+num_train_tokens]
            eval_tokens_src = src_[num_global_tokens+num_train_tokens:]


            attn = partial(checkpoint, self.self_attn) if self.recompute_attn else self.self_attn

            # positional args map to MultiheadAttention.forward(query, key, value,
            # key_padding_mask, need_weights, attn_mask); [0] drops the attention weights.
            # global tokens attend to global + train; train tokens attend to global only;
            # eval tokens attend to the whole sequence.
            global_tokens_src2 = attn(global_tokens_src, global_and_train_tokens_src, global_and_train_tokens_src, None, True, global_src_mask)[0]
            train_tokens_src2 = attn(train_tokens_src, global_tokens_src, global_tokens_src, None, True, trainset_src_mask)[0]
            eval_tokens_src2 = attn(eval_tokens_src, src_, src_,
                                    None, True, valset_src_mask)[0]

            src2 = torch.cat([global_tokens_src2, train_tokens_src2, eval_tokens_src2], dim=0)

        else:
            if self.recompute_attn:
                src2 = checkpoint(self.self_attn, src_, src_, src_, src_key_padding_mask, True, src_mask)[0]
            else:
                src2 = self.self_attn(src_, src_, src_, attn_mask=src_mask,
                                      key_padding_mask=src_key_padding_mask)[0]
        # residual connection around the attention sublayer
        src = src + self.dropout1(src2)
        if not self.pre_norm:
            src = self.norm1(src)

        if self.pre_norm:
            src_ = self.norm2(src)
        else:
            src_ = src
        # feedforward sublayer with residual connection
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src_))))
        src = src + self.dropout2(src2)

        if not self.pre_norm:
            src = self.norm2(src)
        return src
TabPFN/losses.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
|
4 |
+
class CrossEntropyForMulticlassLoss(torch.nn.CrossEntropyLoss):
    # This loss applies cross entropy after reducing the number of prediction
    # dimensions to the number of classes in the target
    # NOTE(review): slicing logits to `len(torch.unique(target[:, b]))` assumes the labels
    # in each batch column are exactly 0..k-1; a column with labels {0, 2} would keep only
    # two logit columns while the target holds class index 2 — confirm callers guarantee
    # contiguous labels.

    # TODO: loss.item() doesn't work so the displayed losses are Nans
    def __init__(self, num_classes, weight=None, size_average=None, ignore_index: int = -100,
                 reduce=None, reduction: str = 'mean', label_smoothing: float = 0.0) -> None:
        # NOTE(review): `weight` and `label_smoothing` are accepted but not forwarded to
        # the parent constructor — confirm this is intentional.
        super().__init__(size_average=size_average, reduce=reduce, reduction=reduction, ignore_index=ignore_index)
        self.num_classes = num_classes

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        # input: (S, B, C) logits (per the indexing below); target: (S, B) integer labels
        loss = torch.zeros_like(input[:, :, 0])
        for b in range(target.shape[1]):
            # reduce logits to the number of classes present in this batch column; with
            # reduction='mean' the parent returns a scalar that is broadcast over column b
            l = super().forward(input[:, b, 0:len(torch.unique(target[:, b]))], target[:, b])
            loss[:, b] += l
        return loss.flatten()
|
21 |
+
def JointBCELossWithLogits(output, target):
    """
    Joint binary cross entropy over a set of candidate sequences.

    output shape: (S, B, NS) with NS = number of sequences
    target shape: (S, B, SL)
    Loss = -log(mean_NS(prod_SL(p(target_SL, output_NS)))); here at the moment NS = SL.

    :param output: raw logits; probabilities are obtained with sigmoid
    :param target: binary targets in {0, 1}
    :return: scalar loss tensor
    """
    output = output.unsqueeze(-1).repeat(1, 1, 1, target.shape[-1])  # (S, B, NS, SL)
    output = output.permute(2, 0, 1, 3)  # (NS, S, B, SL)
    # BUGFIX: removed a leftover debug `print(target.shape, output.shape)` that spammed
    # stdout on every loss evaluation.
    # probability assigned to the correct binary label, per element
    loss = (target * torch.sigmoid(output)) + ((1-target) * (1-torch.sigmoid(output)))
    loss = loss.prod(-1)   # joint probability over the sequence (SL)
    loss = loss.mean(0)    # average over the NS candidate sequences
    loss = -torch.log(loss)
    loss = loss.mean()
    return loss
|
36 |
+
class ScaledSoftmaxCE(nn.Module):
    # Splits the last 10 entries of the input off as temperature scales and the
    # rest as class logits.
    # NOTE(review): this class looks unfinished — the file ends here. `forward`
    # computes `logprobs` but never uses `label`, `temp_scales`, and has no
    # return statement (so it returns None). Also, despite the name, `logprobs`
    # holds softmax probabilities, not log-probabilities — confirm intent before
    # using this loss.
    def forward(self, x, label):
        logits = x[..., :-10]
        temp_scales = x[..., -10:]

        logprobs = logits.softmax(-1)
|
TabPFN/model_builder.py
ADDED
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from train import train, Losses
|
2 |
+
import priors
|
3 |
+
import encoders
|
4 |
+
|
5 |
+
from collections import defaultdict
|
6 |
+
|
7 |
+
from priors.utils import trunc_norm_sampler_f, gamma_sampler_f
|
8 |
+
from utils import get_uniform_single_eval_pos_sampler
|
9 |
+
import torch
|
10 |
+
import math
|
11 |
+
|
12 |
+
def save_model(model, path, filename, config_sample):
    """Save (state_dict, None, serializable config) to `path`/`filename`.

    The config is deep-copied and any callables in it (samplers, activations)
    are replaced by their string representation so the tuple can be pickled.
    """
    config_sample = {**config_sample}

    def make_serializable(value):
        # Recurse into containers; stringify anything callable.
        if isinstance(value, dict):
            return {key: make_serializable(sub) for key, sub in value.items()}
        if isinstance(value, list):
            return [make_serializable(sub) for sub in value]
        if callable(value):
            return str(value)
        return value

    #if 'num_features_used' in config_sample:
    #    del config_sample['num_features_used']

    #config_sample['num_classes_as_str'] = str(config_sample['num_classes'])
    #del config_sample['num_classes']

    serializable_config = make_serializable(config_sample)

    torch.save((model.state_dict(), None, serializable_config), os.path.join(path, filename))
|
33 |
+
|
34 |
+
|
35 |
+
import subprocess as sp
|
36 |
+
import os
|
37 |
+
|
38 |
+
def get_gpu_memory():
    """Return the raw textual output of `nvidia-smi` (GPU/memory status)."""
    return sp.check_output("nvidia-smi".split()).decode('ascii')
|
42 |
+
|
43 |
+
|
44 |
+
def load_model(path, filename, device, eval_positions, verbose):
    """Restore a trained model checkpoint for evaluation only.

    Loads (state_dict, optimizer_state, config) from `path`/`filename`,
    rewrites the config's non-serializable / training-only entries (samplers,
    batch size, bptt), rebuilds the model untrained via `get_model`, and loads
    the weights into it.  Returns (model_tuple, config_sample).

    NOTE(review): `eval_positions` is accepted but never used in this body —
    confirm whether callers rely on it.
    """
    # TODO: This function only restores evaluation functionality but training can't be continued. It is also not flexible.

    model_state, optimizer_state, config_sample = torch.load(
        os.path.join(path, filename), map_location='cpu')
    if ('differentiable_hyperparameters' in config_sample
            and 'prior_mlp_activations' in config_sample['differentiable_hyperparameters']):
        # Activations were stringified on save; keep the saved strings around
        # and substitute a concrete activation class so the model can be built.
        config_sample['differentiable_hyperparameters']['prior_mlp_activations']['choice_values_used'] = config_sample[
            'differentiable_hyperparameters'][
            'prior_mlp_activations'][
            'choice_values']
        config_sample['differentiable_hyperparameters']['prior_mlp_activations']['choice_values'] = [
            torch.nn.Tanh for k in config_sample['differentiable_hyperparameters']['prior_mlp_activations']['choice_values']]

    # Replace serialized training-time objects with cheap evaluation stand-ins,
    # stashing the training values under *_in_training for reference.
    config_sample['categorical_features_sampler'] = lambda: lambda x: ([], [], [])
    config_sample['num_features_used_in_training'] = config_sample['num_features_used']
    config_sample['num_features_used'] = lambda: config_sample['num_features']
    config_sample['num_classes_in_training'] = config_sample['num_classes']
    config_sample['num_classes'] = 2
    config_sample['batch_size_in_training'] = config_sample['batch_size']
    config_sample['batch_size'] = 1
    config_sample['bptt_in_training'] = config_sample['bptt']
    config_sample['bptt'] = 10
    config_sample['bptt_extra_samples_in_training'] = config_sample['bptt_extra_samples']
    config_sample['bptt_extra_samples'] = None

    #print('Memory', str(get_gpu_memory()))

    model = get_model(config_sample, device=device, should_train=False, verbose=verbose)
    # Strip DataParallel's 'module.' prefix so keys match the single-GPU model.
    module_prefix = 'module.'
    model_state = {k.replace(module_prefix, ''): v for k, v in model_state.items()}
    model[2].load_state_dict(model_state)
    model[2].to(device)

    return model, config_sample
|
79 |
+
|
80 |
+
def fix_loaded_config_sample(loaded_config_sample, config):
    """Overwrite, in place, the entries of a loaded (serialized) config that
    were stringified on save with the live objects from the template `config`."""
    def copy_to_sample(*path):
        destination, source = loaded_config_sample, config
        for key in path[:-1]:
            destination = destination[key]
            source = source[key]
        destination[path[-1]] = source[path[-1]]

    copy_to_sample('num_features_used')
    copy_to_sample('num_classes')
    copy_to_sample('differentiable_hyperparameters', 'prior_mlp_activations', 'choice_values')
|
90 |
+
|
91 |
+
def load_config_sample(path, template_config):
    """Load only the config from a checkpoint at `path`, patching its
    serialized entries with live values from `template_config`."""
    _model_state, _optimizer_state, loaded_config_sample = torch.load(path, map_location='cpu')
    fix_loaded_config_sample(loaded_config_sample, template_config)
    return loaded_config_sample
|
95 |
+
|
96 |
+
def get_default_spec(test_datasets, valid_datasets):
    """Default evaluation spec derived from the datasets.

    Returns (bptt, eval_positions, max_features, max_splits), where
    max_features is the widest feature dimension across all datasets.
    Each dataset is a 6-tuple with the feature matrix in position 1.
    """
    bptt = 10000
    eval_positions = [1000, 2000, 3000, 4000, 5000]  # list(2 ** np.array([4, 5, 6, 7, 8, 9, 10, 11, 12]))
    feature_counts = [X.shape[1] for (_, X, _, _, _, _) in test_datasets]
    feature_counts += [X.shape[1] for (_, X, _, _, _, _) in valid_datasets]
    max_features = max(feature_counts)
    max_splits = 5

    return bptt, eval_positions, max_features, max_splits
|
103 |
+
|
104 |
+
def get_mlp_prior_hyperparameters(config):
    """Build the MLP prior's hyperparameter dict from a flat config.

    Single-entry dict values are unwrapped to their first value; when the
    gamma parameters are present, gamma samplers are attached for the prior's
    init std and noise std.
    """
    flat = {}
    for key, value in config.items():
        # Keep the strict `type(...) is dict` check of the surrounding module.
        flat[key] = list(value.values())[0] if type(value) is dict else value

    if "prior_sigma_gamma_k" in flat:
        flat['init_std'] = gamma_sampler_f(flat["prior_sigma_gamma_k"], flat["prior_sigma_gamma_theta"])
    if "prior_noise_std_gamma_k" in flat:
        flat['noise_std'] = gamma_sampler_f(flat["prior_noise_std_gamma_k"], flat["prior_noise_std_gamma_theta"])

    return flat
|
115 |
+
|
116 |
+
|
117 |
+
def get_gp_mix_prior_hyperparameters(config):
    # Maps flat `prior_*` config keys onto the kwarg names expected by the
    # GP-mixture prior.
    # NOTE(review): 'categorical_data' reads prior_y_minmax_norm and
    # 'y_minmax_norm' reads prior_lengthscale_concentration — this looks like a
    # copy-paste slip (prior_lengthscale_concentration is already used for
    # 'lengthscale_concentration' above). Confirm the intended mapping against
    # the fast_gp_mix prior before relying on these two entries.
    return {'lengthscale_concentration': config["prior_lengthscale_concentration"],
            'nu': config["prior_nu"],
            'outputscale_concentration': config["prior_outputscale_concentration"],
            'categorical_data': config["prior_y_minmax_norm"],
            'y_minmax_norm': config["prior_lengthscale_concentration"],
            'noise_concentration': config["prior_noise_concentration"],
            'noise_rate': config["prior_noise_rate"]}
|
125 |
+
|
126 |
+
def get_gp_prior_hyperparameters(config):
    """Return `config` with every single-entry dict value unwrapped to its
    first value; all other entries pass through unchanged."""
    flattened = {}
    for key, value in config.items():
        flattened[key] = list(value.values())[0] if type(value) is dict else value
    return flattened
|
128 |
+
|
129 |
+
|
130 |
+
def get_meta_gp_prior_hyperparameters(config):
    """Build the meta-GP prior's hyperparameter dict.

    Unwraps single-entry dict values, then attaches truncated-normal samplers
    for 'outputscale' / 'lengthscale' when the corresponding mean and std
    factor are configured (std = mean * std_f).
    """
    flat = {key: (list(value.values())[0] if type(value) is dict else value)
            for key, value in config.items()}

    if "outputscale_mean" in flat:
        flat['outputscale'] = trunc_norm_sampler_f(
            flat["outputscale_mean"], flat["outputscale_mean"] * flat["outputscale_std_f"])
    if "lengthscale_mean" in flat:
        flat['lengthscale'] = trunc_norm_sampler_f(
            flat["lengthscale_mean"], flat["lengthscale_mean"] * flat["lengthscale_std_f"])

    return flat
|
143 |
+
|
144 |
+
|
145 |
+
def get_model(config, device, should_train=True, verbose=False, state_dict=None, epoch_callback=None):
    """Build (and by default train) a model from a config dict.

    Selects the prior (prior_bag / mlp / gp / gp_mix), optionally wraps it in
    the flexible-categorical and differentiable priors, picks encoder and loss
    from the config, and hands everything to `train`. With should_train=False
    the model is built with 0 epochs (evaluation only).

    NOTE(review): `config` is mutated in place (num_steps, batch_size,
    eval_positions, ...). Also, `JointBCELossWithLogits`, `BarDistribution`
    and `get_bucket_limits` are not imported in this module — the joint-loss
    and regression branches would raise NameError if taken; confirm where
    those names are expected to come from.
    """
    extra_kwargs = {}
    verbose_train, verbose_prior = verbose >= 1, verbose >= 2
    config['verbose'] = verbose_prior

    # Derive gradient-accumulation factor from an (empirical) memory budget.
    if 'aggregate_k_gradients' not in config or config['aggregate_k_gradients'] is None:
        config['aggregate_k_gradients'] = math.ceil(config['batch_size'] * ((config['nlayers'] * config['emsize'] * config['bptt'] * config['bptt']) / 10824640000))

    # Keep effective batch size constant: more steps, smaller per-step batch.
    config['num_steps'] = math.ceil(config['num_steps'] * config['aggregate_k_gradients'])
    config['batch_size'] = math.ceil(config['batch_size'] / config['aggregate_k_gradients'])
    config['recompute_attn'] = config['recompute_attn'] if 'recompute_attn' in config else False

    def make_get_batch(model_proto, **extra_kwargs):
        # Bind a prior module's get_batch with optional wrapped get_batch /
        # prior-bag sub-priors; missing extras default to None.
        extra_kwargs = defaultdict(lambda: None, **extra_kwargs)
        return (lambda batch_size, seq_len, num_features, hyperparameters
                , device, model_proto=model_proto, get_batch=extra_kwargs['get_batch']
                , prior_bag_priors=extra_kwargs['prior_bag_priors']: model_proto.get_batch(
            batch_size=batch_size
            , seq_len=seq_len
            , device=device
            , get_batch=get_batch
            , hyperparameters=hyperparameters
            , num_features=num_features))

    if config['prior_type'] == 'prior_bag':
        # Prior bag combines priors
        get_batch_gp = make_get_batch(priors.fast_gp)
        get_batch_mlp = make_get_batch(priors.mlp)
        if 'flexible' in config and config['flexible']:
            get_batch_gp = make_get_batch(priors.flexible_categorical, **{'get_batch': get_batch_gp})
            get_batch_mlp = make_get_batch(priors.flexible_categorical, **{'get_batch': get_batch_mlp})
        prior_bag_hyperparameters = {'prior_bag_get_batch': (get_batch_gp, get_batch_mlp)
            , 'prior_bag_exp_weights_1': 2.0}
        prior_hyperparameters = {**get_mlp_prior_hyperparameters(config), **get_gp_prior_hyperparameters(config)
            , **prior_bag_hyperparameters}
        model_proto = priors.prior_bag
    else:
        if config['prior_type'] == 'mlp':
            prior_hyperparameters = get_mlp_prior_hyperparameters(config)
            model_proto = priors.mlp
        elif config['prior_type'] == 'gp':
            prior_hyperparameters = get_gp_prior_hyperparameters(config)
            model_proto = priors.fast_gp
        elif config['prior_type'] == 'gp_mix':
            prior_hyperparameters = get_gp_mix_prior_hyperparameters(config)
            model_proto = priors.fast_gp_mix
        else:
            raise Exception()

        # Wrap the single prior in the flexible-categorical prior if requested.
        if 'flexible' in config and config['flexible']:
            get_batch_base = make_get_batch(model_proto)
            extra_kwargs['get_batch'] = get_batch_base
            model_proto = priors.flexible_categorical

    use_style = False

    # The differentiable prior samples hyperparameters per batch and feeds a
    # style vector to the model; it wraps whatever prior was selected above.
    if 'differentiable' in config and config['differentiable']:
        get_batch_base = make_get_batch(model_proto, **extra_kwargs)
        extra_kwargs = {'get_batch': get_batch_base, 'differentiable_hyperparameters': config['differentiable_hyperparameters']}
        model_proto = priors.differentiable_prior
        use_style = True
    print(f"Using style prior: {use_style}")

    # Any NaN-injection probability enables the NaN-aware encoder.
    if (('nan_prob_no_reason' in config and config['nan_prob_no_reason'] > 0.0) or
            ('nan_prob_a_reason' in config and config['nan_prob_a_reason'] > 0.0) or
            ('nan_prob_unknown_reason' in config and config['nan_prob_unknown_reason'] > 0.0)):
        encoder = encoders.NanHandlingEncoder
    else:
        encoder = encoders.Linear

    num_outputs = config['num_outputs'] if 'num_outputs' in config else 1
    # Loss selection: 2 classes -> BCE (or joint BCE); >2 -> CE; otherwise a
    # bar-distribution regression loss.
    if config['max_num_classes'] == 2:
        if 'joint_loss' in config and config['joint_loss']:
            loss = JointBCELossWithLogits
        else:
            loss = Losses.bce
    elif config['max_num_classes'] > 2:
        loss = Losses.ce(torch.ones((config['max_num_classes'])))
    else:
        loss = BarDistribution(borders=get_bucket_limits(500, full_range=(-10, 10)))

    aggregate_k_gradients = 1 if 'aggregate_k_gradients' not in config else config['aggregate_k_gradients']
    check_is_compatible = False if 'multiclass_loss_type' not in config else (config['multiclass_loss_type'] == 'compatible')
    config['multiclass_type'] = config['multiclass_type'] if 'multiclass_type' in config else 'rank'
    config['mix_activations'] = config['mix_activations'] if 'mix_activations' in config else False

    config['bptt_extra_samples'] = config['bptt_extra_samples'] if 'bptt_extra_samples' in config else None
    config['eval_positions'] = [int(config['bptt'] * 0.95)] if config['bptt_extra_samples'] is None else [int(config['bptt'])]

    epochs = 0 if not should_train else config['epochs']
    model = train(model_proto.DataLoader
                  , loss
                  , encoder
                  , style_encoder_generator = encoders.StyleEncoder if use_style else None
                  , emsize=config['emsize']
                  , nhead=config['nhead']
                  , y_encoder_generator= encoders.get_Canonical(config['max_num_classes']) if config.get('canonical_y_encoder', False) else encoders.Linear
                  , pos_encoder_generator=None
                  , batch_size=config['batch_size']
                  , nlayers=config['nlayers']
                  , nhid=config['emsize'] * config['nhid_factor']
                  , epochs=epochs
                  , total_available_time_in_s=config.get('total_available_time_in_s', None)
                  , warmup_epochs=20
                  , bptt=config['bptt']
                  , gpu_device=device
                  , dropout=config['dropout']
                  , steps_per_epoch=config['num_steps']
                  , single_eval_pos_gen=get_uniform_single_eval_pos_sampler(config['bptt'])
                  , load_weights_from_this_state_dict=state_dict
                  , aggregate_k_gradients=aggregate_k_gradients
                  , check_is_compatible=check_is_compatible
                  , recompute_attn=config['recompute_attn']
                  , epoch_callback=epoch_callback
                  , bptt_extra_samples = config['bptt_extra_samples']
                  , extra_prior_kwargs_dict={
            'num_features': config['num_features']
            , 'fuse_x_y': False
            , 'hyperparameters': prior_hyperparameters
            , 'num_outputs':num_outputs
            , 'dynamic_batch_size': 1 if ('num_global_att_tokens' in config and config['num_global_att_tokens']) else 2
            , **extra_kwargs
        }
                  , lr=config['lr']
                  , verbose=verbose_train,
                  weight_decay=config.get('weight_decay', 0.0),
                  normalize_labels=True)

    return model
|
TabPFN/models_diff/gp_ablation_model.cpkt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c7b0c8febc553cca3fdee265b5a1cd7567dbf83da855969940be4707a9218ffb
|
3 |
+
size 69460013
|
TabPFN/models_diff/prior_diff_real_checkpoint_n_8x_lr0.0003_epoch_49.cpkt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dae97f45bd53d719fc2b23fac4ec55eab16d63892196d939b1bb1c3b408be242
|
3 |
+
size 103616779
|
TabPFN/notebook_utils.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
import io
|
5 |
+
import torch
|
6 |
+
import pickle
|
7 |
+
|
8 |
+
def print_models(base_path, model_string):
    """Print all existing checkpoint files under models_diff/ matching the
    `prior_diff_real_checkpoint{model_string}_n_{i}_epoch_{e}.cpkt` scheme."""
    print(model_string)

    for run_id in range(80):
        for epoch in range(50):
            candidate = os.path.join(
                base_path,
                f'models_diff/prior_diff_real_checkpoint{model_string}_n_{run_id}_epoch_{epoch}.cpkt')
            if Path(candidate).is_file():
                print(candidate)
    print()
|
17 |
+
|
18 |
+
class CustomUnpickler(pickle.Unpickler):
    """Unpickler that remaps the project's `Manager` class and forces torch
    storages onto the CPU so GPU-saved checkpoints load on CPU-only machines.

    Lookups that still fail resolve to None (deliberate best effort) instead
    of raising.
    """

    def find_class(self, module, name):
        if name == 'Manager':
            from settings import Manager
            return Manager
        try:
            return self.find_class_cpu(module, name)
        # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit; keep the best-effort None fallback for real errors.
        except Exception:
            return None

    def find_class_cpu(self, module, name):
        # torch pickles storages via torch.storage._load_from_bytes; rebinding
        # it with map_location='cpu' strips any CUDA device from the data.
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        else:
            return super().find_class(module, name)
|
TabPFN/positional_encodings.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import nn
|
5 |
+
|
6 |
+
|
7 |
+
# Protocol for positional encodings.
|
8 |
+
# __init__(d_model, max_len=..[, more optionals])
|
9 |
+
# forward(x: (seq_len, bs, d_model)) -> Tensor of shape (*x.shape[:2],d_model) containing pos. embeddings
|
10 |
+
|
11 |
+
|
12 |
+
class NoPositionalEncoding(nn.Module):
    """Identity stand-in that satisfies the positional-encoding protocol
    (__init__(d_model, max_len), forward(x)) without adding anything."""

    def __init__(self, d_model, max_len=None):
        super().__init__()

    def forward(self, x):
        # Deliberately a no-op; the scaled variant stays disabled.
        return x  # * math.sqrt(x.shape[-1])
|
19 |
+
|
20 |
+
|
21 |
+
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding, added to the input.

    The table follows the Transformer recipe: sin on even feature indices,
    cos on odd ones, with geometrically spaced frequencies.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        frequencies = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(positions * frequencies)
        table[:, 1::2] = torch.cos(positions * frequencies)
        # Shape (max_len, 1, d_model) so it broadcasts over the batch dim.
        self.register_buffer('pe', table.unsqueeze(0).transpose(0, 1))

    def forward(self, x):
        # x: (seq_len, bs, d_model)
        return self.pe[:x.size(0), :] + x  # * math.sqrt(x.shape[-1])
|
35 |
+
|
36 |
+
|
37 |
+
class LearnedPositionalEncoding(nn.Module):
    """Positional encoding with one trainable embedding vector per position,
    added to the input (same embedding for every batch element)."""

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        self.max_seq_len = max_len
        self.positional_embeddings = nn.Parameter(torch.empty(max_len, d_model))
        nn.init.normal_(self.positional_embeddings, mean=0, std=d_model ** -0.5)

    def forward(self, x):
        seq_len, bs, d_model = x.shape
        assert seq_len <= len(self.positional_embeddings), 'seq_len can be at most max_len.'
        embeddings = self.positional_embeddings[:seq_len]
        return x + embeddings.unsqueeze(1).expand(seq_len, bs, d_model)  # * math.sqrt(x.shape[-1])
|
50 |
+
|
51 |
+
|
52 |
+
class PairedScrambledPositionalEncodings(LearnedPositionalEncoding):
    """Learned positional encoding whose embedding rows are randomly permuted
    in (row) pairs on every forward pass; the same permutation is used for
    the whole batch.

    TODO check whether it is a problem to use the same perm. for full batch
    """

    def forward(self, x):
        seq_len, bs, d_model = x.shape
        table = self.positional_embeddings
        assert seq_len <= len(table), 'seq_len can be at most max_len.'
        assert len(table) % 2 == 0, 'Please specify an even max_len.'

        # Group the table into pairs along the feature axis, shuffle the pair
        # rows, then flatten back and take the first seq_len positions.
        pairs = table.view(len(table), -1, 2)
        shuffled = pairs[torch.randperm(len(pairs))].view(*table.shape)[:seq_len]

        return shuffled.unsqueeze(1).expand(seq_len, bs, d_model) + x  # * math.sqrt(x.shape[-1])
|
63 |
+
|
64 |
+
|
65 |
+
|
66 |
+
|
67 |
+
|
68 |
+
|
69 |
+
|
70 |
+
|
TabPFN/prior_tuning_result.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:24d2189bbc836aeea888cf6c540f2c1b45b5351822931189e8bf10a0bc80a0b6
|
3 |
+
size 18668851
|
TabPFN/priors/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from . import fast_gp, mlp, flexible_categorical, differentiable_prior, prior_bag
|
2 |
+
|
3 |
+
|
4 |
+
|
TabPFN/priors/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (286 Bytes). View file
|
|
TabPFN/priors/__pycache__/differentiable_prior.cpython-39.pyc
ADDED
Binary file (15.6 kB). View file
|
|
TabPFN/priors/__pycache__/fast_gp.cpython-39.pyc
ADDED
Binary file (4.5 kB). View file
|
|
TabPFN/priors/__pycache__/flexible_categorical.cpython-39.pyc
ADDED
Binary file (8.77 kB). View file
|
|
TabPFN/priors/__pycache__/mlp.cpython-39.pyc
ADDED
Binary file (6.78 kB). View file
|
|
TabPFN/priors/__pycache__/prior.cpython-39.pyc
ADDED
Binary file (370 Bytes). View file
|
|
TabPFN/priors/__pycache__/prior_bag.cpython-39.pyc
ADDED
Binary file (1.52 kB). View file
|
|
TabPFN/priors/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (7.71 kB). View file
|
|
TabPFN/priors/differentiable_prior.py
ADDED
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
import math
|
4 |
+
|
5 |
+
from .utils import get_batch_to_dataloader
|
6 |
+
from utils import default_device
|
7 |
+
from .utils import order_by_y, normalize_by_used_features_f
|
8 |
+
|
9 |
+
from .utils import trunc_norm_sampler_f, beta_sampler_f, gamma_sampler_f, uniform_sampler_f, zipf_sampler_f, scaled_beta_sampler_f, uniform_int_sampler_f
|
10 |
+
|
11 |
+
|
12 |
+
def unpack_dict_of_tuples(d):
    """Transpose a dict of tuples into a list of dicts, one per tuple slot.

    {'a': (1, 2), 'b': (3, 4)} -> [{'a': 1, 'b': 3}, {'a': 2, 'b': 4}]
    """
    keys = list(d.keys())
    return [dict(zip(keys, column)) for column in zip(*d.values())]
|
16 |
+
|
17 |
+
class DifferentiableHyperparameter(nn.Module):
    """One sampleable hyperparameter.

    Calling the module (`forward`) returns a pair
    (indicator, value): `value` is the sampled hyperparameter (or a sampler
    closure for meta distributions) and `indicator` is a normalized number (or
    dict of numbers) describing what was sampled.
    """
    ## We can sample this and get a hyperparameter value and a normalized hyperparameter indicator
    def __init__(self, distribution, embedding_dim, device, **args):
        super(DifferentiableHyperparameter, self).__init__()

        self.distribution = distribution
        self.embedding_dim = embedding_dim
        self.device=device
        # Any extra keyword (min, max, scale, choice_values, ...) becomes an
        # attribute; the branches below read whichever ones they need.
        for key in args:
            setattr(self, key, args[key])

        def get_sampler():
            # Returns (sampler_f, min, max, mean, std) for the plain
            # (non-meta) distributions; mean/std are None for fixed samples.
            #if self.distribution == "beta":
            #    return beta_sampler_f(self.a, self.b), 0, 1
            #elif self.distribution == "gamma":
            #    return gamma_sampler_f(self.a, self.b), 0, 1
            #elif self.distribution == "beta_int":
            #    return scaled_beta_sampler_f(self.a, self.b, self.scale, self.min), self.scale + self.min, self.min, self.a / (self.a + self.b)
            if self.distribution == "uniform":
                if not hasattr(self, 'sample'):
                    return uniform_sampler_f(self.min, self.max), self.min, self.max, (self.max+self.min) / 2, math.sqrt(1/12*(self.max-self.min)*(self.max-self.min))
                else:
                    # A fixed `sample` was supplied: always return it.
                    return lambda: self.sample, self.min, self.max, None, None
            elif self.distribution == "uniform_int":
                return uniform_int_sampler_f(self.min, self.max), self.min, self.max, (self.max+self.min) / 2, math.sqrt(1/12*(self.max-self.min)*(self.max-self.min))

        if self.distribution.startswith("meta"):
            # Meta distributions first sample their own (hyper-)hyperparameters
            # via self.hparams, then build a sampler from them.
            self.hparams = {}
            def sample_meta(f):
                indicators, passed = unpack_dict_of_tuples({hp: self.hparams[hp]() for hp in self.hparams})
                # sampled_embeddings = list(itertools.chain.from_iterable([sampled_embeddings[k] for k in sampled_embeddings]))
                meta_passed = f(**passed)
                return indicators, meta_passed

            args_passed = {'device': device, 'embedding_dim': embedding_dim}
            if self.distribution == "meta_beta":
                ## Truncated normal where std and mean are drawn randomly logarithmically scaled
                if hasattr(self, 'b') and hasattr(self, 'k'):
                    # Fixed b/k: no indicator (None), value passed through.
                    self.hparams = {'b': lambda: (None, self.b), 'k': lambda: (None, self.k)}
                else:
                    self.hparams = {"b": DifferentiableHyperparameter(distribution="uniform", min=self.min
                                                                      , max=self.max, **args_passed)
                        , "k": DifferentiableHyperparameter(distribution="uniform", min=self.min
                                                            , max=self.max, **args_passed)}
                def make_beta(b, k):
                    return lambda b=b, k=k: self.scale * beta_sampler_f(b, k)()
                self.sampler = lambda make_beta=make_beta : sample_meta(make_beta)
            elif self.distribution == "meta_trunc_norm_log_scaled":
                # these choices are copied down below, don't change these without changing `replace_differentiable_distributions`
                self.min_std = self.min_std if hasattr(self, 'min_std') else 0.001
                self.max_std = self.max_std if hasattr(self, 'max_std') else self.max_mean
                ## Truncated normal where std and mean are drawn randomly logarithmically scaled
                if not hasattr(self, 'log_mean'):
                    self.hparams = {"log_mean": DifferentiableHyperparameter(distribution="uniform", min=math.log(self.min_mean)
                                                                             , max=math.log(self.max_mean), **args_passed)
                        , "log_std": DifferentiableHyperparameter(distribution="uniform", min=math.log(self.min_std)
                                                                  , max=math.log(self.max_std), **args_passed)}
                else:
                    self.hparams = {'log_mean': lambda: (None, self.log_mean), 'log_std': lambda: (None, self.log_std)}
                def make_trunc_norm(log_mean, log_std):
                    return ((lambda : self.lower_bound + round(trunc_norm_sampler_f(math.exp(log_mean), math.exp(log_std))())) if self.round
                            else (lambda: self.lower_bound + trunc_norm_sampler_f(math.exp(log_mean), math.exp(log_std))()))

                self.sampler = lambda make_trunc_norm=make_trunc_norm: sample_meta(make_trunc_norm)
            elif self.distribution == "meta_trunc_norm":
                self.min_std = self.min_std if hasattr(self, 'min_std') else 0
                self.max_std = self.max_std if hasattr(self, 'max_std') else self.max_mean
                self.hparams = {"mean": DifferentiableHyperparameter(distribution="uniform", min=self.min_mean
                                                                     , max=self.max_mean, **args_passed)
                    , "std": DifferentiableHyperparameter(distribution="uniform", min=self.min_std
                                                          , max=self.max_std, **args_passed)}
                # NOTE(review): unlike the log-scaled variant, mean/std are
                # sampled in linear space here but still passed through
                # math.exp below — confirm that exponentiation is intended.
                def make_trunc_norm(mean, std):
                    return ((lambda: self.lower_bound + round(
                        trunc_norm_sampler_f(math.exp(mean), math.exp(std))())) if self.round
                            else (
                        lambda make_trunc_norm=make_trunc_norm: self.lower_bound + trunc_norm_sampler_f(math.exp(mean), math.exp(std))()))
                self.sampler = lambda : sample_meta(make_trunc_norm)
            elif self.distribution == "meta_choice":
                if hasattr(self, 'choice_1_weight'):
                    # NOTE(review): these lambdas all close over the same loop
                    # variable `i`, so every entry reads the *last* choice's
                    # weight (classic late binding); fix would be `i=i` default.
                    self.hparams = {f'choice_{i}_weight': lambda: (None, getattr(self, f'choice_{i}_weight')) for i in range(1, len(self.choice_values))}
                else:
                    self.hparams = {f"choice_{i}_weight": DifferentiableHyperparameter(distribution="uniform", min=-5.0
                                                                                       , max=6.0, **args_passed) for i in range(1, len(self.choice_values))}
                def make_choice(**choices):
                    # First choice gets a fixed weight of 1.0; the rest use the
                    # sampled weights, softmaxed into a categorical.
                    weights = torch.softmax(torch.tensor([1.0] + [choices[i] for i in choices], dtype=torch.float), 0) # create a tensor of weights
                    sample = torch.multinomial(weights, 1, replacement=True).numpy()[0]
                    return self.choice_values[sample]

                self.sampler = lambda make_choice=make_choice: sample_meta(make_choice)
            elif self.distribution == "meta_choice_mixed":
                if hasattr(self, 'choice_1_weight'):
                    # NOTE(review): same `i` late-binding issue as meta_choice.
                    self.hparams = {f'choice_{i}_weight': lambda: (None, getattr(self, f'choice_{i}_weight')) for i in range(1, len(self.choice_values))}
                else:
                    self.hparams = {f"choice_{i}_weight": DifferentiableHyperparameter(distribution="uniform", min=-5.0
                                                                                       , max=6.0, **args_passed) for i in range(1, len(self.choice_values))}
                def make_choice(**choices):
                    weights = torch.softmax(torch.tensor([1.0] + [choices[i] for i in choices], dtype=torch.float), 0) # create a tensor of weights
                    def sample():
                        s = torch.multinomial(weights, 1, replacement=True).numpy()[0]
                        # choice_values hold callables here; re-draw per call.
                        return self.choice_values[s]()
                    return lambda: sample

                self.sampler = lambda make_choice=make_choice: sample_meta(make_choice)
        else:
            def return_two(x, min, max, mean, std):
                # Returns (a hyperparameter value, and an indicator value passed to the model)
                if mean is not None:
                    ind = (x-mean)/std#(2 * (x-min) / (max-min) - 1)
                else:
                    ind = None
                return ind, x # normalize indicator to [-1, 1]
            # def sample_standard(sampler_f, embedding):
            #     s = torch.tensor([sampler_f()], device = self.device)
            #     return s, embedding(s)
            self.sampler_f, self.sampler_min, self.sampler_max, self.sampler_mean, self.sampler_std = get_sampler()
            self.sampler = lambda : return_two(self.sampler_f(), min=self.sampler_min, max=self.sampler_max
                                               , mean=self.sampler_mean, std=self.sampler_std)
            # self.embedding_layer = nn.Linear(1, self.embedding_dim, device=self.device)
            # self.embed = lambda x : self.embedding_layer(
            #     (x - self.sampler_min) / (self.sampler_max - self.sampler_min))
            #self.sampler = lambda : sample_standard(self.sampler_f, self.embedding)


    def forward(self):
        # (indicator, sampled value / sampler closure) — see class docstring.
        s, s_passed = self.sampler()
        return s, s_passed
|
143 |
+
|
144 |
+
|
145 |
+
|
146 |
+
class DifferentiableHyperparameterList(nn.Module):
    """Container holding one DifferentiableHyperparameter per configured name.

    Falsy entries in `hyperparameters` are dropped before construction.
    """
    def __init__(self, hyperparameters, embedding_dim, device):
        super().__init__()

        self.device = device
        hyperparameters = {k: v for (k, v) in hyperparameters.items() if v}
        self.hyperparameters = nn.ModuleDict({hp: DifferentiableHyperparameter(embedding_dim = embedding_dim
                                                                               , name = hp
                                                                               , device = device, **hyperparameters[hp]) for hp in hyperparameters})
    def get_hyperparameter_info(self):
        # Returns parallel lists: indicator key names, and per-key
        # (normalize, denormalize) function pairs. Meta hyperparameters
        # contribute one entry per sub-hyperparameter, keyed '{hp}_{sub}'.
        sampled_hyperparameters_f, sampled_hyperparameters_keys = [], []
        def append_hp(hp_key, hp_val):
            sampled_hyperparameters_keys.append(hp_key)
            # Function remaps hyperparameters from [-1, 1] range to true value
            s_min, s_max, s_mean, s_std = hp_val.sampler_min, hp_val.sampler_max, hp_val.sampler_mean, hp_val.sampler_std
            # Each call gets fresh locals, so the closures below are safe.
            sampled_hyperparameters_f.append((lambda x: (x-s_mean)/s_std, lambda y : (y * s_std)+s_mean))
            #sampled_hyperparameters_f.append(((lambda x: ((x - s_min) / (s_max - s_min) * (2) - 1)
            #    , (lambda y: ((y + 1) * (1 / 2) * (s_max - s_min) + s_min))))
        for hp in self.hyperparameters:
            hp_val = self.hyperparameters[hp]
            if hasattr(hp_val, 'hparams'):
                for hp_ in hp_val.hparams:
                    append_hp(f'{hp}_{hp_}', hp_val.hparams[hp_])
            else:
                append_hp(hp, hp_val)


        return sampled_hyperparameters_keys, sampled_hyperparameters_f

    def sample_parameter_object(self):
        # Draw every hyperparameter once. Meta hyperparameters return a dict
        # of indicators, which is flattened with '{hp}_{sub}' keys.
        sampled_hyperparameters, s_passed = {}, {}
        for hp in self.hyperparameters:
            sampled_hyperparameters_, s_passed_ = self.hyperparameters[hp]()
            s_passed[hp] = s_passed_
            if isinstance(sampled_hyperparameters_, dict):
                sampled_hyperparameters_ = {hp + '_' + str(key): val for key, val in sampled_hyperparameters_.items()}
                sampled_hyperparameters.update(sampled_hyperparameters_)
            else:
                sampled_hyperparameters[hp] = sampled_hyperparameters_

        # s_passed contains the values passed to the get_batch function
        # sampled_hyperparameters contains the indicator of the sampled value, i.e. only number that describe the sampled object
        return s_passed, sampled_hyperparameters#self.pack_parameter_object(sampled_embeddings)
|
189 |
+
|
190 |
+
class DifferentiablePrior(torch.nn.Module):
    """Prior wrapper that re-samples differentiable hyperparameters per call.

    Each ``forward()`` draws a fresh set of hyperparameters, merges them over
    the fixed config ``hyperparameters``, and delegates batch generation to
    the wrapped ``get_batch`` function.
    """

    def __init__(self, get_batch, hyperparameters, differentiable_hyperparameters, args):
        super(DifferentiablePrior, self).__init__()

        self.h = hyperparameters
        self.args = args
        self.get_batch = get_batch
        self.differentiable_hyperparameters = DifferentiableHyperparameterList(differentiable_hyperparameters
                                                                              , embedding_dim=self.h['emsize']
                                                                              , device=self.args['device'])

    def forward(self):
        """Sample hyperparameters, then a batch under them.

        Returns ``(x, y, y_, indicators)`` where ``indicators`` is the dict of
        numeric descriptors of the sampled hyperparameters.
        """
        # Sample hyperparameters
        sampled_hyperparameters_passed, sampled_hyperparameters_indicators = self.differentiable_hyperparameters.sample_parameter_object()

        # Sampled values override any identically-named fixed config entries.
        hyperparameters = {**self.h, **sampled_hyperparameters_passed}
        x, y, y_ = self.get_batch(hyperparameters=hyperparameters, **self.args)

        return x, y, y_, sampled_hyperparameters_indicators
|
209 |
+
|
210 |
+
|
211 |
+
# TODO: Make this a class that keeps objects
|
212 |
+
# TODO: Make this a class that keeps objects
@torch.no_grad()
def get_batch(batch_size, seq_len, num_features, get_batch
              , device=default_device, differentiable_hyperparameters={}
              , hyperparameters=None, batch_size_per_gp_sample=None, **kwargs):
    """Assemble a full batch from several independently-sampled sub-priors.

    Splits ``batch_size`` into ``num_models`` groups of
    ``batch_size_per_gp_sample`` sequences, draws each group from its own
    ``DifferentiablePrior`` (i.e. its own hyperparameter sample), and packs
    the per-group hyperparameter indicators into one tensor.

    NOTE(review): ``differentiable_hyperparameters={}`` is a mutable default
    argument — safe only because it is never mutated here.
    NOTE(review): the ``'verbose' in hyperparameters`` test below assumes
    ``hyperparameters`` is a dict (not None) — confirm against callers.
    """
    batch_size_per_gp_sample = batch_size_per_gp_sample or (min(64, batch_size))
    num_models = batch_size // batch_size_per_gp_sample
    assert num_models * batch_size_per_gp_sample == batch_size, f'Batch size ({batch_size}) not divisible by batch_size_per_gp_sample ({batch_size_per_gp_sample})'

    args = {'device': device, 'seq_len': seq_len, 'num_features': num_features, 'batch_size': batch_size_per_gp_sample}

    models = [DifferentiablePrior(get_batch, hyperparameters, differentiable_hyperparameters, args) for _ in range(num_models)]
    # One 4-tuple (x, y, y_, indicator dict) per sub-model.
    sample = sum([[model()] for model in models], [])

    x, y, y_, hyperparameter_dict = zip(*sample)

    if 'verbose' in hyperparameters and hyperparameters['verbose']:
        print('Hparams', hyperparameter_dict[0].keys())

    # One row per sub-model, one column per hyperparameter indicator.
    hyperparameter_matrix = []
    for batch in hyperparameter_dict:
        hyperparameter_matrix.append([batch[hp] for hp in batch])

    transposed_hyperparameter_matrix = list(zip(*hyperparameter_matrix))
    assert all([all([hp is None for hp in hp_]) or all([hp is not None for hp in hp_]) for hp_ in transposed_hyperparameter_matrix]), 'it should always be the case that when a hyper-parameter is None, once it is always None'
    # we remove columns that are only None (i.e. not sampled)
    hyperparameter_matrix = [[hp for hp in hp_ if hp is not None] for hp_ in hyperparameter_matrix]
    if len(hyperparameter_matrix[0]) > 0:
        packed_hyperparameters = torch.tensor(hyperparameter_matrix)
        # Repeat each sub-model's row so every sequence in its group carries
        # the same hyperparameter indicator vector.
        packed_hyperparameters = torch.repeat_interleave(packed_hyperparameters, repeats=batch_size_per_gp_sample, dim=0).detach()
    else:
        packed_hyperparameters = None

    # Concatenate groups along the batch dimension (dim 1; layout is (T,B,...)).
    x, y, y_, packed_hyperparameters = (torch.cat(x, 1).detach()
                                        , torch.cat(y, 1).detach()
                                        , torch.cat(y_, 1).detach()
                                        , packed_hyperparameters)

    return x, y, y_, packed_hyperparameters
|
250 |
+
|
251 |
+
# Wrap this prior's get_batch into the project's dataloader class.
DataLoader = get_batch_to_dataloader(get_batch)
DataLoader.num_outputs = 1
#DataLoader.validate = lambda : 0
|
254 |
+
|
255 |
+
def draw_random_style(dl, device):
    """Pull one batch from ``dl`` and return a single style (hyperparameter
    embedding) row as a ``(1, emsize)`` tensor on ``device``."""
    batch_inputs, _targets = next(iter(dl))
    hp_embedding = batch_inputs[0]
    return hp_embedding.to(device)[:1, :]
|
258 |
+
|
259 |
+
def merge_style_with_info(diff_hparams_keys, diff_hparams_f, style, transform=True):
    """Map hyperparameter names to the entries of a style row.

    ``style`` is a ``(1, n)`` tensor; ``diff_hparams_f`` holds
    (normalize, denormalize) function pairs per key. With ``transform=True``
    each raw style value is passed through its denormalizer, otherwise the
    raw value is returned unchanged.
    """
    raw_values = style.detach().cpu().numpy().tolist()[0]
    merged = {}
    for key, f_pair, raw in zip(diff_hparams_keys, diff_hparams_f, raw_values):
        merged[key] = f_pair[1](raw) if transform else raw
    return merged
|
267 |
+
|
268 |
+
|
269 |
+
import ConfigSpace.hyperparameters as CSH
|
270 |
+
|
271 |
+
def replace_differentiable_distributions(config):
    """Mutate ``config['differentiable_hyperparameters']`` in place, attaching
    ConfigSpace hyperparameter objects for each entry's distribution type.

    Raises ``ValueError`` for unknown distribution names.

    NOTE(review): the 'meta_choice' and 'meta_choice_mixed' branches are
    identical here — confirm whether they are meant to diverge.
    NOTE(review): the choice-weight names use ``name+f'choice_{i}_weight'``
    without the '_' separator used by the other branches — confirm intended.
    """
    diff_config = config['differentiable_hyperparameters']
    for name, diff_hp_dict in diff_config.items():
        distribution = diff_hp_dict['distribution']
        if distribution == 'uniform':
            diff_hp_dict['sample'] = CSH.UniformFloatHyperparameter(name, diff_hp_dict['min'], diff_hp_dict['max'])
        elif distribution == 'meta_beta':
            # Beta-shaped meta-prior: sample its two shape parameters k and b.
            diff_hp_dict['k'] = CSH.UniformFloatHyperparameter(name+'_k', diff_hp_dict['min'], diff_hp_dict['max'])
            diff_hp_dict['b'] = CSH.UniformFloatHyperparameter(name+'_b', diff_hp_dict['min'], diff_hp_dict['max'])
        elif distribution == 'meta_choice':
            # One weight per non-first choice; the first choice is the reference.
            for i in range(1, len(diff_hp_dict['choice_values'])):
                diff_hp_dict[f'choice_{i}_weight'] = CSH.UniformFloatHyperparameter(name+f'choice_{i}_weight', -5.0, 6.0)
        elif distribution == 'meta_choice_mixed':
            for i in range(1, len(diff_hp_dict['choice_values'])):
                diff_hp_dict[f'choice_{i}_weight'] = CSH.UniformFloatHyperparameter(name+f'choice_{i}_weight', -5.0, 6.0)
        elif distribution == 'meta_trunc_norm_log_scaled':
            # Mean and std are sampled in log-space between the configured bounds.
            diff_hp_dict['log_mean'] = CSH.UniformFloatHyperparameter(name+'_log_mean', math.log(diff_hp_dict['min_mean']), math.log(diff_hp_dict['max_mean']))
            min_std = diff_hp_dict['min_std'] if 'min_std' in diff_hp_dict else 0.001
            max_std = diff_hp_dict['max_std'] if 'max_std' in diff_hp_dict else diff_hp_dict['max_mean']
            diff_hp_dict['log_std'] = CSH.UniformFloatHyperparameter(name+'_log_std', math.log(min_std), math.log(max_std))
        else:
            raise ValueError(f'Unknown distribution {distribution}')
|
293 |
+
|
TabPFN/priors/fast_gp.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import nn
|
5 |
+
import gpytorch
|
6 |
+
|
7 |
+
from .utils import get_batch_to_dataloader
|
8 |
+
from utils import default_device
|
9 |
+
|
10 |
+
|
11 |
+
# We will use the simplest form of GP model, exact inference
|
12 |
+
# We will use the simplest form of GP model, exact inference
class ExactGPModel(gpytorch.models.ExactGP):
    """Exact GP with a constant mean and a scaled RBF kernel."""

    def __init__(self, train_x, train_y, likelihood):
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        """Return the GP prior/posterior as a MultivariateNormal over ``x``."""
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
|
22 |
+
|
23 |
+
|
24 |
+
def get_model(x, y, hyperparameters):
    """Build an ``ExactGPModel`` with fixed noise/outputscale/lengthscale.

    ``hyperparameters`` must provide the keys "noise", "outputscale" and
    "lengthscale". Returns ``(model, likelihood)``.
    """
    likelihood = gpytorch.likelihoods.GaussianLikelihood(noise_constraint=gpytorch.constraints.GreaterThan(1.e-9))
    model = ExactGPModel(x, y, likelihood)
    # Broadcast the scalar hyperparameters onto the parameter tensors.
    model.likelihood.noise = torch.ones_like(model.likelihood.noise) * hyperparameters["noise"]
    model.covar_module.outputscale = torch.ones_like(model.covar_module.outputscale) * hyperparameters["outputscale"]
    model.covar_module.base_kernel.lengthscale = torch.ones_like(model.covar_module.base_kernel.lengthscale) * \
                                                 hyperparameters["lengthscale"]
    return model, likelihood
|
32 |
+
|
33 |
+
|
34 |
+
@torch.no_grad()
def get_batch(batch_size, seq_len, num_features, device=default_device, hyperparameters=None,
              equidistant_x=False, fix_x=None, **kwargs):
    """Sample a batch of function draws from a GP prior.

    Inputs ``x`` are drawn uniformly/normally (or placed on an equidistant
    grid / fixed to ``fix_x``), then targets are sampled from the GP prior
    defined by ``hyperparameters``. Returns ``(x, sample, sample)`` with
    ``x`` in (T, B, H) layout.
    """
    if isinstance(hyperparameters, (tuple, list)):
        hyperparameters = {"noise": hyperparameters[0]
                           , "outputscale": hyperparameters[1]
                           , "lengthscale": hyperparameters[2]
                           , "is_binary_classification": hyperparameters[3]
                           # , "num_features_used": hyperparameters[4]
                           , "normalize_by_used_features": hyperparameters[5]
                           , "order_y": hyperparameters[6]
                           , "sampling": hyperparameters[7]
                           }
    elif hyperparameters is None:
        hyperparameters = {"noise": .1, "outputscale": .1, "lengthscale": .1}

    if 'verbose' in hyperparameters and hyperparameters['verbose']:
        print({"noise": hyperparameters['noise'], "outputscale": hyperparameters['outputscale']
                  , "lengthscale": hyperparameters['lengthscale'], 'batch_size': batch_size, 'sampling': hyperparameters['sampling']})

    assert not (equidistant_x and (fix_x is not None))

    with gpytorch.settings.fast_computations(*hyperparameters.get('fast_computations', (True, True, True))):
        if equidistant_x:
            assert num_features == 1
            # BUGFIX: create the grid directly on `device`; previously it was
            # left on the CPU while the model was moved to `device`, breaking
            # GPU runs in this branch.
            x = torch.linspace(0, 1., seq_len, device=device).unsqueeze(0).repeat(batch_size, 1).unsqueeze(-1)
        elif fix_x is not None:
            assert fix_x.shape == (seq_len, num_features)
            x = fix_x.unsqueeze(0).repeat(batch_size, 1, 1).to(device)
        else:
            if hyperparameters.get('sampling', 'uniform') == 'uniform':
                x = torch.rand(batch_size, seq_len, num_features, device=device)
            else:
                x = torch.randn(batch_size, seq_len, num_features, device=device)

        # Retry until sampling succeeds: torch.linalg.eigh can fail for some
        # random inits; rebuilding the model resolves it. (The redundant model
        # construction that used to precede this loop was dead code — the loop
        # rebuilds the model on every attempt.)
        is_fitted = False
        while not is_fitted:
            try:
                with gpytorch.settings.prior_mode(True):
                    model, likelihood = get_model(x, torch.Tensor(), hyperparameters)
                    model.to(device)

                    d = model(x)
                    d = likelihood(d)
                    sample = d.sample().transpose(0, 1)
                    is_fitted = True
            except RuntimeError:  # This can happen when torch.linalg.eigh fails. Restart with new init resolves this.
                print('GP Fitting unsuccessful, retrying.. ')
                print(x)
                print(hyperparameters)

    if bool(torch.any(torch.isnan(x)).detach().cpu().numpy()):
        print({"noise": hyperparameters['noise'], "outputscale": hyperparameters['outputscale']
                  , "lengthscale": hyperparameters['lengthscale'], 'batch_size': batch_size})

    # TODO: Multi output
    return x.transpose(0, 1), sample, sample  # x.shape = (T,B,H)
|
96 |
+
|
97 |
+
# Wrap the GP prior's get_batch into the project's dataloader class.
DataLoader = get_batch_to_dataloader(get_batch)
DataLoader.num_outputs = 1
|
99 |
+
|
100 |
+
def get_model_on_device(x, y, hyperparameters, device):
    """Build the exact-GP (model, likelihood) pair and move the model to ``device``."""
    gp_model, gp_likelihood = get_model(x, y, hyperparameters)
    gp_model.to(device)
    return gp_model, gp_likelihood
|
104 |
+
|
105 |
+
|
106 |
+
@torch.no_grad()
def evaluate(x, y, y_non_noisy, use_mse=False, hyperparameters={}, get_model_on_device=get_model_on_device, device=default_device, step_size=1, start_pos=0):
    """Sequentially evaluate GP one-step-ahead predictions.

    For each position ``t`` a fresh GP is fit on the first ``t`` points of
    every sequence and scored on point ``t`` (MSE or negative log-likelihood).

    Returns ``(all_losses, mean_losses, elapsed_seconds)`` with tensors moved
    to CPU.

    NOTE(review): ``hyperparameters={}`` is a mutable default — safe here
    because it is only read via ``.get``.
    """
    start_time = time.time()
    # Position 0 has no training data; by convention its loss is 0.
    losses_after_t = [.0] if start_pos == 0 else []
    all_losses_after_t = []

    with gpytorch.settings.fast_computations(*hyperparameters.get('fast_computations',(True,True,True))), gpytorch.settings.fast_pred_var(False):
        for t in range(max(start_pos, 1), len(x), step_size):
            loss_sum = 0.
            # Refit on the prefix [:t]; sequences are moved to batch-first for the GP.
            model, likelihood = get_model_on_device(x[:t].transpose(0, 1), y[:t].transpose(0, 1), hyperparameters, device)

            model.eval()
            # print([t.shape for t in model.train_inputs])
            # print(x[:t].transpose(0,1).shape, x[t].unsqueeze(1).shape, y[:t].transpose(0,1).shape)
            f = model(x[t].unsqueeze(1))
            l = likelihood(f)
            means = l.mean.squeeze()
            varis = l.covariance_matrix.squeeze()
            # print(l.variance.squeeze(), l.mean.squeeze(), y[t])

            assert len(means.shape) == len(varis.shape) == 1
            assert len(means) == len(varis) == x.shape[1]

            if use_mse:
                c = nn.MSELoss(reduction='none')
                ls = c(means, y[t])
            else:
                ls = -l.log_prob(y[t].unsqueeze(1))

            losses_after_t.append(ls.mean())
            all_losses_after_t.append(ls.flatten())
    return torch.stack(all_losses_after_t).to('cpu'), torch.tensor(losses_after_t).to('cpu'), time.time() - start_time
|
139 |
+
|
140 |
+
if __name__ == '__main__':
    # BUGFIX: the old demo used `hps = (.1, .1, .1)`, which crashes — the
    # tuple branch of get_batch indexes hyperparameters[3]..[7] (8 entries
    # expected), and evaluate() calls hyperparameters.get on it. A dict with
    # the three GP hyperparameters exercises the supported code path.
    hps = {'noise': .1, 'outputscale': .1, 'lengthscale': .1}
    for redo_idx in range(1):
        print(
            evaluate(*get_batch(1000, 10, hyperparameters=hps, num_features=10), use_mse=False, hyperparameters=hps))
|
TabPFN/priors/flexible_categorical.py
ADDED
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
import random
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from torch import nn
|
6 |
+
|
7 |
+
from .utils import get_batch_to_dataloader
|
8 |
+
from utils import normalize_data, nan_handling_missing_for_unknown_reason_value, nan_handling_missing_for_no_reason_value, nan_handling_missing_for_a_reason_value, to_ranking_low_mem, remove_outliers
|
9 |
+
from .utils import normalize_by_used_features_f, randomize_classes, CategoricalActivation
|
10 |
+
from .utils import uniform_int_sampler_f
|
11 |
+
|
12 |
+
time_it = False
|
13 |
+
|
14 |
+
class BalancedBinarize(nn.Module):
    """Binarize a tensor at its median so the two classes are roughly balanced."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        threshold = torch.median(x)
        return (x > threshold).float()
|
20 |
+
|
21 |
+
def class_sampler_f(min_, max_):
    """Return a sampler for a class count: with probability ~0.5 it yields 2
    (binary), otherwise a uniform integer drawn from [min_, max_]."""
    def sample():
        pick_uniform = random.random() > 0.5
        if pick_uniform:
            return uniform_int_sampler_f(min_, max_)()
        return 2
    return sample
|
27 |
+
|
28 |
+
class MulticlassRank(nn.Module):
    """Turn continuous targets into class labels via rank-based boundaries.

    The class count is itself sampled (biased toward binary). Boundaries are
    taken from values observed in the sequence, then per-batch-column the
    label order may be randomized and/or reversed.
    """

    def __init__(self, num_classes, ordered_p=0.5):
        super().__init__()
        # Sampled count, not the given maximum: ~50% of the time this is 2.
        self.num_classes = class_sampler_f(2, num_classes)()
        self.ordered_p = ordered_p

    def forward(self, x):
        # x has shape (T,B,H)

        # CAUTION: This samples the same idx in sequence for each class boundary in a batch
        class_boundaries = torch.randint(0, x.shape[0], (self.num_classes - 1,))
        class_boundaries = x[class_boundaries].unsqueeze(1)

        # Label = number of boundaries the value exceeds.
        d = (x > class_boundaries).sum(axis=0)

        # Randomly permute class labels for a subset of batch columns...
        randomized_classes = torch.rand((d.shape[1], )) > self.ordered_p
        d[:, randomized_classes] = randomize_classes(d[:, randomized_classes], self.num_classes)
        # ...and reverse the label order for a random half of the columns.
        reverse_classes = torch.rand((d.shape[1],)) > 0.5
        d[:, reverse_classes] = self.num_classes - 1 - d[:, reverse_classes]
        return d
|
48 |
+
|
49 |
+
class MulticlassValue(nn.Module):
    """Turn continuous targets into class labels via fixed random thresholds.

    NOTE(review): ``self.classes`` holds ``num_classes - 1`` boundaries based
    on the constructor argument, while ``self.num_classes`` is a freshly
    sampled (possibly smaller) count used for permuting/reversing labels —
    confirm this asymmetry is intended.
    """

    def __init__(self, num_classes, ordered_p=0.5):
        super().__init__()
        self.num_classes = class_sampler_f(2, num_classes)()
        # Fixed (non-trainable) random thresholds shared across the batch.
        self.classes = nn.Parameter(torch.randn(num_classes-1), requires_grad=False)
        self.ordered_p = ordered_p

    def forward(self, x):
        # x has shape (T,B,H)
        # Label = number of thresholds the value exceeds.
        d = (x > (self.classes.unsqueeze(-1).unsqueeze(-1))).sum(axis=0)

        # Randomly permute labels for some batch columns, reverse for others.
        randomized_classes = torch.rand((d.shape[1],)) > self.ordered_p
        d[:, randomized_classes] = randomize_classes(d[:, randomized_classes], self.num_classes)
        reverse_classes = torch.rand((d.shape[1],)) > 0.5
        d[:, reverse_classes] = self.num_classes - 1 - d[:, reverse_classes]
        return d
|
65 |
+
|
66 |
+
class MulticlassMultiNode(nn.Module):
    """Class assignment that samples labels from multiple output nodes.

    For 3-D inputs, the first ``self.num_classes`` feature channels are
    treated as per-class logits and a label is drawn multinomially; for 2-D
    inputs it falls back to threshold-based ``MulticlassValue``.
    """

    def __init__(self, num_classes, ordered_p=0.5):
        super().__init__()
        self.num_classes = class_sampler_f(2, num_classes)()
        self.classes = nn.Parameter(torch.randn(num_classes-1), requires_grad=False)
        self.alt_multi_class = MulticlassValue(num_classes, ordered_p)

    def forward(self, x):
        # x has shape T, B, H
        if len(x.shape) == 2:
            # No per-class channels available; use the threshold-based variant.
            return self.alt_multi_class(x)
        # T sharpens the sigmoid probabilities (temperature-like exponent);
        # NaNs are replaced by a small constant before sampling.
        T = 3
        x[torch.isnan(x)] = 0.00001
        d = torch.multinomial(torch.pow(0.00001+torch.sigmoid(x[:, :, 0:self.num_classes]).reshape(-1, self.num_classes), T), 1, replacement=True).reshape(x.shape[0], x.shape[1]).float()
        return d
|
81 |
+
|
82 |
+
|
83 |
+
class FlexibleCategorical(torch.nn.Module):
    """Post-processing wrapper around a base prior's ``get_batch``.

    Resolves callable hyperparameters once per instance, then on each forward
    pass: samples a raw batch, optionally injects NaNs, optionally makes
    features categorical, normalizes, converts targets to class labels, and
    pads unused feature columns with zeros.
    """

    def __init__(self, get_batch, hyperparameters, args):
        super(FlexibleCategorical, self).__init__()

        # Resolve callable hyperparameters to concrete values for this instance.
        self.h = {k: hyperparameters[k]() if callable(hyperparameters[k]) else hyperparameters[k] for k in
                  hyperparameters.keys()}
        self.args = args
        self.args_passed = {**self.args}
        # The base prior only sees the actually-used feature count.
        self.args_passed.update({'num_features': self.h['num_features_used']})
        self.get_batch = get_batch

        # Choose how continuous targets are converted to class labels.
        if self.h['num_classes'] > 1 and not self.h['balanced']:
            if self.h['multiclass_type'] == 'rank':
                self.class_assigner = MulticlassRank(self.h['num_classes']
                                                     , ordered_p=self.h['output_multiclass_ordered_p']
                                                     )
            elif self.h['multiclass_type'] == 'value':
                self.class_assigner = MulticlassValue(self.h['num_classes']
                                                      , ordered_p=self.h['output_multiclass_ordered_p']
                                                      )
            elif self.h['multiclass_type'] == 'multi_node':
                self.class_assigner = MulticlassMultiNode(self.h['num_classes'])
            else:
                raise ValueError("Unknow Multiclass type")
        elif self.h['num_classes'] == 2 and self.h['balanced']:
            self.class_assigner = BalancedBinarize()
        elif self.h['num_classes'] > 2 and self.h['balanced']:
            raise NotImplementedError("Balanced multiclass training is not possible")
        else:
            self.class_assigner = lambda x:x # Regression

    def drop_for_reason(self, x, v):
        """Replace entries of ``x`` with ``v`` in a structured (value-dependent) way."""
        nan_prob_sampler = CategoricalActivation(ordered_p=0.0
                                                 , categorical_p=1.0
                                                 , keep_activation_size=False,
                                                 num_classes_sampler=lambda: 20)
        d = nan_prob_sampler(x)
        # TODO: Make a different ordering for each activation
        x[d < torch.rand((1,), device=x.device) * 20 * self.h['nan_prob_no_reason'] * random.random()] = v
        return x

    def drop_for_no_reason(self, x, v):
        """Replace entries of ``x`` with ``v`` uniformly at random."""
        x[torch.rand(x.shape, device=self.args['device']) < self.h['nan_prob_no_reason']] = v
        return x

    def forward(self, batch_size):
        # NOTE(review): ``batch_size`` is accepted but unused here — the batch
        # size comes from ``self.args``; confirm callers rely on that.
        start = time.time()
        x, y, y_ = self.get_batch(hyperparameters=self.h, **self.args_passed)
        if time_it:
            print('Flex Forward Block 1', round(time.time() - start, 3))

        start = time.time()

        if self.h['nan_prob_no_reason']+self.h['nan_prob_a_reason']+self.h['nan_prob_unknown_reason'] > 0 and random.random() > 0.5: # Only one out of two datasets should have nans
            if self.h['nan_prob_no_reason'] > 0 and random.random() > 0.5: # Missing for no reason
                x = self.drop_for_no_reason(x, nan_handling_missing_for_no_reason_value(self.h['set_value_to_nan']))

            if self.h['nan_prob_a_reason'] > 0 and random.random() > 0.5: # Missing for a reason
                x = self.drop_for_reason(x, nan_handling_missing_for_a_reason_value(self.h['set_value_to_nan']))

            if self.h['nan_prob_unknown_reason'] > 0: # Missing for unknown reason and random.random() > 0.5
                if random.random() < self.h['nan_prob_unknown_reason_reason_prior']:
                    x = self.drop_for_no_reason(x, nan_handling_missing_for_unknown_reason_value(self.h['set_value_to_nan']))
                else:
                    x = self.drop_for_reason(x, nan_handling_missing_for_unknown_reason_value(self.h['set_value_to_nan']))

        # Categorical features
        if 'categorical_feature_p' in self.h and random.random() > 1 - self.h['categorical_feature_p']:
            p = random.random()
            for col in range(x.shape[2]):
                m = MulticlassRank(10, ordered_p=0.3)
                if random.random() > p:
                    x[:, :, col] = m(x[:, :, col])

        if time_it:
            print('Flex Forward Block 2', round(time.time() - start, 3))
            start = time.time()

        if self.h['normalize_to_ranking']:
            x = to_ranking_low_mem(x)
        else:
            x = remove_outliers(x)
        x, y = normalize_data(x), normalize_data(y)

        if time_it:
            print('Flex Forward Block 3', round(time.time() - start, 3))
            start = time.time()

        # Cast to classification if enabled
        y = self.class_assigner(y).float()

        if time_it:
            print('Flex Forward Block 4', round(time.time() - start, 3))
            start = time.time()
        if self.h['normalize_by_used_features']:
            x = normalize_by_used_features_f(x, self.h['num_features_used'], self.args['num_features'], normalize_with_sqrt=self.h.get('normalize_with_sqrt',False))
        if time_it:
            print('Flex Forward Block 5', round(time.time() - start, 3))

        start = time.time()
        # Append empty features if enabled
        x = torch.cat(
            [x, torch.zeros((x.shape[0], x.shape[1], self.args['num_features'] - self.h['num_features_used']),
                            device=self.args['device'])], -1)
        if time_it:
            print('Flex Forward Block 6', round(time.time() - start, 3))

        return x, y, y # x.shape = (T,B,H)
|
191 |
+
|
192 |
+
import torch.cuda as cutorch
|
193 |
+
|
194 |
+
@torch.no_grad()
def get_batch(batch_size, seq_len, num_features, get_batch, device, hyperparameters=None, batch_size_per_gp_sample=None, **kwargs):
    """Build a batch from multiple independent ``FlexibleCategorical`` models.

    ``batch_size`` is split into groups of ``batch_size_per_gp_sample``
    sequences; each group is generated by its own model (i.e. its own
    resolved hyperparameter set) and the groups are concatenated along the
    batch dimension.
    """
    batch_size_per_gp_sample = batch_size_per_gp_sample or (min(32, batch_size))
    num_models = batch_size // batch_size_per_gp_sample
    assert num_models > 0, f'Batch size ({batch_size}) is too small for batch_size_per_gp_sample ({batch_size_per_gp_sample})'
    assert num_models * batch_size_per_gp_sample == batch_size, f'Batch size ({batch_size}) not divisible by batch_size_per_gp_sample ({batch_size_per_gp_sample})'

    # Sample one seq_len for entire batch
    seq_len = hyperparameters['seq_len_used']() if callable(hyperparameters['seq_len_used']) else seq_len

    args = {'device': device, 'seq_len': seq_len, 'num_features': num_features, 'batch_size': batch_size_per_gp_sample}

    models = [FlexibleCategorical(get_batch, hyperparameters, args).to(device) for _ in range(num_models)]

    start = time.time()
    sample = sum([[model(batch_size=batch_size_per_gp_sample)] for model in models], [])
    #print('sample', time.time() - start)

    x, y, y_ = zip(*sample)
    # Concatenate groups along the batch dimension (dim 1; layout is (T,B,...)).
    x, y, y_ = torch.cat(x, 1).detach(), torch.cat(y, 1).detach(), torch.cat(y_, 1).detach()

    # # TODO: Reintegrate this code (Doesn't work on batch dim), could be applied to each batch sample individually
    # if hyperparameters['is_binary_classification'] and hyperparameters['order_y']:
    #     x, y = order_by_y(x, y)

    return x, y, y_
|
220 |
+
|
221 |
+
# num_features_used = num_features_used_sampler()
|
222 |
+
# prior_outputscale = prior_outputscale_sampler()
|
223 |
+
# prior_lengthscale = prior_lengthscale_sampler()
|
224 |
+
#
|
225 |
+
# x, sample = normalize_data(x), normalize_data(sample)
|
226 |
+
#
|
227 |
+
# if is_binary_classification:
|
228 |
+
# sample = (sample > torch.median(sample, dim=0)[0]).float()
|
229 |
+
#
|
230 |
+
# if normalize_by_used_features:
|
231 |
+
# x = normalize_by_used_features_f(x, num_features_used, num_features)
|
232 |
+
#
|
233 |
+
# # # if is_binary_classification and order_y:
|
234 |
+
# # # x, sample = order_by_y(x, sample)
|
235 |
+
# #
|
236 |
+
# # Append empty features if enabled
|
237 |
+
# x = torch.cat([x, torch.zeros((x.shape[0], x.shape[1], num_features - num_features_used), device=device)], -1)
|
238 |
+
|
239 |
+
# Wrap the flexible-categorical prior's get_batch into the project's dataloader class.
DataLoader = get_batch_to_dataloader(get_batch)
DataLoader.num_outputs = 1
|
TabPFN/priors/mlp.py
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
import math
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from torch import nn
|
6 |
+
import numpy as np
|
7 |
+
|
8 |
+
from utils import default_device
|
9 |
+
from .utils import get_batch_to_dataloader
|
10 |
+
|
11 |
+
class GaussianNoise(nn.Module):
    """Additive Gaussian-noise layer with a fixed std (scalar or per-element)."""

    def __init__(self, std, device):
        super().__init__()
        self.std = std
        self.device = device

    def forward(self, x):
        noise = torch.normal(torch.zeros_like(x), self.std)
        return x + noise
|
19 |
+
|
20 |
+
|
21 |
+
def causes_sampler_f(num_causes):
    """Sample per-cause means and non-negative stds for the MLP prior inputs.

    Means are standard-normal draws; each std is |second_draw * mean|, so
    larger-magnitude means tend to get larger spreads.
    """
    mean_draws = np.random.normal(0, 1, num_causes)
    scale_draws = np.random.normal(0, 1, num_causes)
    std_draws = np.abs(scale_draws * mean_draws)
    return mean_draws, std_draws
|
25 |
+
|
26 |
+
def get_batch(batch_size, seq_len, num_features, hyperparameters, device=default_device, num_outputs=1, sampling='normal', **kwargs):
|
27 |
+
if ('mix_activations' in hyperparameters) and hyperparameters['mix_activations']:
|
28 |
+
s = hyperparameters['prior_mlp_activations']()
|
29 |
+
hyperparameters['prior_mlp_activations'] = lambda : s
|
30 |
+
|
31 |
+
class MLP(torch.nn.Module):
|
32 |
+
def __init__(self, hyperparameters):
|
33 |
+
super(MLP, self).__init__()
|
34 |
+
|
35 |
+
with torch.no_grad():
|
36 |
+
|
37 |
+
for key in hyperparameters:
|
38 |
+
setattr(self, key, hyperparameters[key])
|
39 |
+
|
40 |
+
assert (self.num_layers >= 2)
|
41 |
+
|
42 |
+
if 'verbose' in hyperparameters and self.verbose:
|
43 |
+
print({k : hyperparameters[k] for k in ['is_causal', 'num_causes', 'prior_mlp_hidden_dim'
|
44 |
+
, 'num_layers', 'noise_std', 'y_is_effect', 'pre_sample_weights', 'prior_mlp_dropout_prob'
|
45 |
+
, 'pre_sample_causes']})
|
46 |
+
|
47 |
+
if self.is_causal:
|
48 |
+
self.prior_mlp_hidden_dim = max(self.prior_mlp_hidden_dim, num_outputs + 2 * num_features)
|
49 |
+
else:
|
50 |
+
self.num_causes = num_features
|
51 |
+
|
52 |
+
# This means that the mean and standard deviation of each cause is determined in advance
|
53 |
+
if self.pre_sample_causes:
|
54 |
+
self.causes_mean, self.causes_std = causes_sampler_f(self.num_causes)
|
55 |
+
self.causes_mean = torch.tensor(self.causes_mean, device=device).unsqueeze(0).unsqueeze(0).tile(
|
56 |
+
(seq_len, 1, 1))
|
57 |
+
self.causes_std = torch.tensor(self.causes_std, device=device).unsqueeze(0).unsqueeze(0).tile(
|
58 |
+
(seq_len, 1, 1))
|
59 |
+
|
60 |
+
def generate_module(layer_idx, out_dim):
|
61 |
+
# Determine std of each noise term in initialization, so that is shared in runs
|
62 |
+
# torch.abs(torch.normal(torch.zeros((out_dim)), self.noise_std)) - Change std for each dimension?
|
63 |
+
noise = (GaussianNoise(torch.abs(torch.normal(torch.zeros(size=(1, out_dim), device=device), float(self.noise_std))), device=device)
|
64 |
+
if self.pre_sample_weights else GaussianNoise(float(self.noise_std), device=device))
|
65 |
+
return [
|
66 |
+
nn.Sequential(*[self.prior_mlp_activations()
|
67 |
+
, nn.Linear(self.prior_mlp_hidden_dim, out_dim)
|
68 |
+
, noise])
|
69 |
+
]
|
70 |
+
|
71 |
+
self.layers = [nn.Linear(self.num_causes, self.prior_mlp_hidden_dim, device=device)]
|
72 |
+
self.layers += [module for layer_idx in range(self.num_layers-1) for module in generate_module(layer_idx, self.prior_mlp_hidden_dim)]
|
73 |
+
if not self.is_causal:
|
74 |
+
self.layers += generate_module(-1, num_outputs)
|
75 |
+
self.layers = nn.Sequential(*self.layers)
|
76 |
+
|
77 |
+
# Initialize Model parameters
|
78 |
+
for i, (n, p) in enumerate(self.layers.named_parameters()):
|
79 |
+
if self.block_wise_dropout:
|
80 |
+
if len(p.shape) == 2: # Only apply to weight matrices and not bias
|
81 |
+
nn.init.zeros_(p)
|
82 |
+
# TODO: N blocks should be a setting
|
83 |
+
n_blocks = random.randint(1, math.ceil(math.sqrt(min(p.shape[0], p.shape[1]))))
|
84 |
+
w, h = p.shape[0] // n_blocks, p.shape[1] // n_blocks
|
85 |
+
keep_prob = (n_blocks*w*h) / p.numel()
|
86 |
+
for block in range(0, n_blocks):
|
87 |
+
nn.init.normal_(p[w * block: w * (block+1), h * block: h * (block+1)], std=self.init_std / keep_prob**(1/2))
|
88 |
+
else:
|
89 |
+
if len(p.shape) == 2: # Only apply to weight matrices and not bias
|
90 |
+
dropout_prob = self.prior_mlp_dropout_prob if i > 0 else 0.0 # Don't apply dropout in first layer
|
91 |
+
dropout_prob = min(dropout_prob, 0.99)
|
92 |
+
nn.init.normal_(p, std=self.init_std / (1. - dropout_prob)**(1/2))
|
93 |
+
p *= torch.bernoulli(torch.zeros_like(p) + 1. - dropout_prob)
|
94 |
+
|
95 |
+
def forward(self):
|
96 |
+
def sample_normal():
|
97 |
+
if self.pre_sample_causes:
|
98 |
+
causes = torch.normal(self.causes_mean, self.causes_std.abs()).float()
|
99 |
+
else:
|
100 |
+
causes = torch.normal(0., 1., (seq_len, 1, self.num_causes), device=device).float()
|
101 |
+
return causes
|
102 |
+
|
103 |
+
if self.sampling == 'normal':
|
104 |
+
causes = sample_normal()
|
105 |
+
elif self.sampling == 'mixed':
|
106 |
+
zipf_p, multi_p, normal_p = random.random() * 0.66, random.random() * 0.66, random.random() * 0.66
|
107 |
+
def sample_cause(n):
|
108 |
+
if random.random() > normal_p:
|
109 |
+
if self.pre_sample_causes:
|
110 |
+
return torch.normal(self.causes_mean[:, :, n], self.causes_std[:, :, n].abs()).float()
|
111 |
+
else:
|
112 |
+
return torch.normal(0., 1., (seq_len, 1), device=device).float()
|
113 |
+
elif random.random() > multi_p:
|
114 |
+
x = torch.multinomial(torch.rand((random.randint(2, 10))), seq_len, replacement=True).to(device).unsqueeze(-1).float()
|
115 |
+
x = (x - torch.mean(x)) / torch.std(x)
|
116 |
+
return x
|
117 |
+
else:
|
118 |
+
x = torch.minimum(torch.tensor(np.random.zipf(2.0 + random.random() * 2, size=(seq_len)),
|
119 |
+
device=device).unsqueeze(-1).float(), torch.tensor(10.0, device=device))
|
120 |
+
return x - torch.mean(x)
|
121 |
+
causes = torch.cat([sample_cause(n).unsqueeze(-1) for n in range(self.num_causes)], -1)
|
122 |
+
elif self.sampling == 'uniform':
|
123 |
+
causes = torch.rand((seq_len, 1, self.num_causes), device=device)
|
124 |
+
else:
|
125 |
+
raise ValueError(f'Sampling is set to invalid setting: {sampling}.')
|
126 |
+
|
127 |
+
outputs = [causes]
|
128 |
+
for layer in self.layers:
|
129 |
+
outputs.append(layer(outputs[-1]))
|
130 |
+
outputs = outputs[2:]
|
131 |
+
|
132 |
+
if self.is_causal:
|
133 |
+
## Sample nodes from graph if model is causal
|
134 |
+
outputs_flat = torch.cat(outputs, -1)
|
135 |
+
|
136 |
+
if self.in_clique:
|
137 |
+
random_perm = random.randint(0, outputs_flat.shape[-1] - num_outputs - num_features) + torch.randperm(num_outputs + num_features, device=device)
|
138 |
+
else:
|
139 |
+
random_perm = torch.randperm(outputs_flat.shape[-1]-1, device=device)
|
140 |
+
|
141 |
+
random_idx_y = list(range(-num_outputs, -0)) if self.y_is_effect else random_perm[0:num_outputs]
|
142 |
+
random_idx = random_perm[num_outputs:num_outputs + num_features]
|
143 |
+
|
144 |
+
if self.sort_features:
|
145 |
+
random_idx, _ = torch.sort(random_idx)
|
146 |
+
y = outputs_flat[:, :, random_idx_y]
|
147 |
+
|
148 |
+
x = outputs_flat[:, :, random_idx]
|
149 |
+
else:
|
150 |
+
y = outputs[-1][:, :, :]
|
151 |
+
x = causes
|
152 |
+
|
153 |
+
if bool(torch.any(torch.isnan(x)).detach().cpu().numpy()) or bool(torch.any(torch.isnan(y)).detach().cpu().numpy()):
|
154 |
+
x[:] = 0.0
|
155 |
+
y[:] = 1.0
|
156 |
+
|
157 |
+
return x, y
|
158 |
+
|
159 |
+
model = MLP(hyperparameters).to(device)
|
160 |
+
|
161 |
+
sample = sum([[model()] for _ in range(0, batch_size)], [])
|
162 |
+
|
163 |
+
x, y = zip(*sample)
|
164 |
+
y = torch.cat(y, 1).detach().squeeze(2)
|
165 |
+
x = torch.cat(x, 1).detach()
|
166 |
+
x = x[..., torch.randperm(x.shape[-1])]
|
167 |
+
|
168 |
+
return x, y, y
|
169 |
+
|
170 |
+
|
171 |
+
# Expose the MLP prior as a DataLoader factory; each sample has one scalar target.
DataLoader = get_batch_to_dataloader(get_batch)
DataLoader.num_outputs = 1
|
173 |
+
|
TabPFN/priors/prior.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch.utils.data import DataLoader
|
2 |
+
|
3 |
+
|
4 |
+
class PriorDataLoader(DataLoader):
    """Marker base class for data loaders that sample batches from a prior.

    Concrete subclasses are generated by ``get_batch_to_dataloader`` and are
    expected to follow the informal contract described in the comments below.
    """
    pass
    # init accepts num_steps as first argument
    # has two attributes set on class or object level:
    # num_features: int and
    # num_outputs: int
    # fuse_x_y: bool
    # Optional: validate function that accepts a transformer model
|
TabPFN/priors/prior_bag.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
from .utils import get_batch_to_dataloader
|
4 |
+
from utils import default_device
|
5 |
+
|
6 |
+
def get_batch(batch_size, seq_len, num_features, device=default_device
              , hyperparameters=None, batch_size_per_gp_sample=None, **kwargs):
    """Sample one batch from a weighted mixture of priors (a "prior bag").

    The batch is split into ``num_models`` sub-batches of size
    ``batch_size_per_gp_sample``; for each sub-batch one prior is drawn from
    ``hyperparameters['prior_bag_get_batch']`` with softmax-normalized weights,
    and the resulting sub-batches are concatenated along the batch dimension.

    Returns:
        (x, y, target_y): tensors concatenated along dim 1 (batch), detached.
    """
    batch_size_per_gp_sample = batch_size_per_gp_sample or min(64, batch_size)
    num_models = batch_size // batch_size_per_gp_sample
    assert num_models * batch_size_per_gp_sample == batch_size, \
        f'Batch size ({batch_size}) not divisible by batch_size_per_gp_sample ({batch_size_per_gp_sample})'

    args = {'device': device, 'seq_len': seq_len, 'num_features': num_features, 'batch_size': batch_size_per_gp_sample}

    prior_bag_priors_get_batch = hyperparameters['prior_bag_get_batch']
    # The first prior's weight is fixed to 1.0; the remaining weights are
    # tunable hyperparameters 'prior_bag_exp_weights_1', 'prior_bag_exp_weights_2', ...
    prior_bag_priors_p = [1.0] + [hyperparameters[f'prior_bag_exp_weights_{i}'] for i in range(1, len(prior_bag_priors_get_batch))]

    weights = torch.tensor(prior_bag_priors_p, dtype=torch.float)  # create a tensor of weights
    # Sample one prior index per sub-batch, proportional to softmax(weights).
    batch_assignments = torch.multinomial(torch.softmax(weights, 0), num_models, replacement=True).numpy()

    if hyperparameters.get('verbose'):
        print('PRIOR_BAG:', weights, batch_assignments)

    # One (x, y, target_y) triple per assigned prior. (Plain comprehension
    # replaces the original quadratic sum([[...]], []) flattening.)
    sample = [prior_bag_priors_get_batch[int(prior_idx)](hyperparameters=hyperparameters, **args)
              for prior_idx in batch_assignments]

    x, y, y_ = zip(*sample)
    x, y, y_ = (torch.cat(x, 1).detach()
                , torch.cat(y, 1).detach()
                , torch.cat(y_, 1).detach())
    return x, y, y_
|
30 |
+
|
31 |
+
# Expose the prior-bag sampler as a DataLoader factory; one scalar target per sample.
DataLoader = get_batch_to_dataloader(get_batch)
DataLoader.num_outputs = 1
|
TabPFN/priors/utils.py
ADDED
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
|
3 |
+
import torch
|
4 |
+
|
5 |
+
from utils import set_locals_in_self
|
6 |
+
from .prior import PriorDataLoader
|
7 |
+
from torch import nn
|
8 |
+
import numpy as np
|
9 |
+
import matplotlib.pyplot as plt
|
10 |
+
import matplotlib.gridspec as gridspec
|
11 |
+
import scipy.stats as stats
|
12 |
+
import math
|
13 |
+
|
14 |
+
def get_batch_to_dataloader(get_batch_method_):
    """Wrap a prior's ``get_batch`` function into a ``PriorDataLoader`` subclass.

    The returned class yields ``num_steps`` freshly sampled batches per
    iteration pass, each produced by calling ``get_batch_method_`` with the
    kwargs stored at construction time.
    """
    class DL(PriorDataLoader):
        get_batch_method = get_batch_method_

        # Caution, you might need to set self.num_features manually if it is not part of the args.
        def __init__(self, num_steps, fuse_x_y=False, **get_batch_kwargs):
            # set_locals_in_self stores every constructor argument as an
            # attribute (presumably including get_batch_kwargs itself, which
            # __iter__ reads back — confirm against utils.set_locals_in_self).
            set_locals_in_self(locals())
            # The stuff outside the or is set as class attribute before instantiation.
            self.num_features = get_batch_kwargs.get('num_features') or self.num_features
            self.num_outputs = get_batch_kwargs.get('num_outputs') or self.num_outputs
            print('DataLoader.__dict__', self.__dict__)

        @staticmethod
        def gbm(*args, fuse_x_y=True, **kwargs):
            # Draw one batch and return (inputs, target_y), either fused or as a tuple.
            # 'seq_len' may be a zero-arg callable for dynamically sampled lengths.
            dynamic_seq_len = callable(kwargs['seq_len'])
            kwargs['seq_len'] = kwargs['seq_len']() if dynamic_seq_len else kwargs['seq_len']
            # Scales the batch size dynamically with the power of 'dynamic_batch_size'.
            # A transformer with quadratic memory usage in the seq len would need a power of 2 to keep memory constant.
            if dynamic_seq_len and 'dynamic_batch_size' in kwargs and kwargs['dynamic_batch_size'] > 0:
                kwargs['batch_size'] = kwargs['batch_size'] * math.floor(math.pow(kwargs['seq_len_maximum'], kwargs['dynamic_batch_size']) / math.pow(kwargs['seq_len'], kwargs['dynamic_batch_size']))
            batch = get_batch_method_(*args, **kwargs)
            # Priors return (x, y, target_y) or (x, y, target_y, style).
            x, y, target_y, style = batch if len(batch) == 4 else (batch[0], batch[1], batch[2], None)
            if fuse_x_y:
                # Append y shifted one step back as an extra input feature;
                # the first position sees zeros instead of a label.
                return torch.cat([x, torch.cat([torch.zeros_like(y[:1]), y[:-1]], 0).unsqueeze(-1).float()],
                                 -1), target_y
            else:
                return (style, x, y), target_y

        def __len__(self):
            return self.num_steps

        def __iter__(self):
            # A new generator each pass: every batch is sampled fresh from the prior.
            return iter(self.gbm(**self.get_batch_kwargs, fuse_x_y=self.fuse_x_y) for _ in range(self.num_steps))


    return DL
|
50 |
+
|
51 |
+
import seaborn as sns
|
52 |
+
def plot_features(data, targets, fig=None):
    """Pairwise feature plot colored by target label.

    Draws a data.shape[1] x data.shape[1] grid: a KDE of each feature on the
    diagonal, scatter plots of feature pairs off the diagonal.

    Args:
        data: (n_samples, n_features) tensor or array.
        targets: (n_samples,) labels used as hue.
        fig: optional existing matplotlib figure to draw into.
    """
    if torch.is_tensor(data):
        data = data.detach().cpu().numpy()
        targets = targets.detach().cpu().numpy()

    fig2 = fig if fig else plt.figure(figsize=(8, 8))
    spec2 = gridspec.GridSpec(ncols=data.shape[1], nrows=data.shape[1], figure=fig2)
    for d in range(0, data.shape[1]):
        for d2 in range(0, data.shape[1]):
            sub_ax = fig2.add_subplot(spec2[d, d2])
            if d == d2:
                sns.kdeplot(data[:, d], hue=targets[:], ax=sub_ax, legend=False, palette="deep")
                sub_ax.set(ylabel=None)
            else:
                # Fix: pass ax=sub_ax so the scatter is drawn into this grid
                # cell; without it seaborn targets pyplot's current axes,
                # which is not guaranteed to be the subplot just created.
                sns.scatterplot(x=data[:, d], y=data[:, d2],
                                hue=targets[:], legend=False, palette="deep", ax=sub_ax)
            sub_ax.get_xaxis().set_ticks([])
            sub_ax.get_yaxis().set_ticks([])
    plt.subplots_adjust(wspace=0.05, hspace=0.05)
    fig2.show()
|
83 |
+
|
84 |
+
|
85 |
+
def plot_prior(prior):
    """Draw 1000 samples from ``prior`` (a zero-arg sampler), plot a density
    histogram with 50 bins, and print the smallest sampled value."""
    draws = np.array([prior() for _ in range(0, 1000)])
    count, bins, ignored = plt.hist(draws, 50, density=True)
    print(draws.min())
    plt.show()
|
90 |
+
|
91 |
+
# Sampler factories: each returns a zero-arg callable that draws one value.
# Normal(mu, sigma) truncated to [0, 1e6].
trunc_norm_sampler_f = lambda mu, sigma : lambda: stats.truncnorm((0 - mu) / sigma, (1000000 - mu) / sigma, loc=mu, scale=sigma).rvs(1)[0]
beta_sampler_f = lambda a, b : lambda : np.random.beta(a, b)
gamma_sampler_f = lambda a, b : lambda : np.random.gamma(a, b)
uniform_sampler_f = lambda a, b : lambda : np.random.uniform(a, b)
# Uniform over [a, b] rounded to the nearest integer (Python banker's rounding at .5).
uniform_int_sampler_f = lambda a, b : lambda : round(np.random.uniform(a, b))
|
96 |
+
def zipf_sampler_f(a, b, c):
    """Return a sampler for a bounded Zipf distribution over integers [b, c).

    Probabilities are proportional to x**(-a). The frozen scipy distribution
    is constructed once here instead of on every call (the original rebuilt
    ``rv_discrete`` per sample); the sampled values and random stream are
    unchanged since construction consumes no randomness.
    """
    x = np.arange(b, c)
    weights = x ** (-a)
    weights /= weights.sum()
    dist = stats.rv_discrete(name='bounded_zipf', values=(x, weights))
    return lambda : dist.rvs(1)
|
101 |
+
# Beta(a, b) draw rescaled and rounded to an integer in [minimum, scale].
scaled_beta_sampler_f = lambda a, b, scale, minimum : lambda : minimum + round(beta_sampler_f(a, b)() * (scale - minimum))
|
102 |
+
|
103 |
+
|
104 |
+
def normalize_by_used_features_f(x, num_features_used, num_features, normalize_with_sqrt=False):
    """Scale ``x`` up to compensate for only ``num_features_used`` of
    ``num_features`` columns carrying signal (the rest being padding).

    With ``normalize_with_sqrt`` the correction uses the square root of the
    used-feature fraction instead of the fraction itself.
    """
    used_fraction = num_features_used / num_features
    divisor = used_fraction ** (1 / 2) if normalize_with_sqrt else used_fraction
    return x / divisor
|
108 |
+
|
109 |
+
|
110 |
+
def order_by_y(x, y):
    """Reorder ``x`` and ``y`` along the sequence dimension so that samples
    alternate between the lower and upper half of the y ordering.

    The sort direction (ascending vs. descending y) is a coin flip; assumes
    y has shape (seq_len, 1, 1) with seq_len even — TODO confirm at callers.
    """
    sort_key = y if random.randint(0, 1) else -y
    order = torch.argsort(sort_key, dim=0)[:, 0, 0]
    # Interleave the two halves of the sorted index: [a0, b0, a1, b1, ...]
    order = order.reshape(2, -1).transpose(0, 1).reshape(-1)
    return x[order], y[order]
|
117 |
+
|
118 |
+
def randomize_classes(x, num_classes):
    """Remap the class ids in ``x`` through one shared random permutation of
    {0, ..., num_classes - 1}.

    Matching is by equality against the id range; entries of ``x`` outside
    that range map to 0.
    """
    class_ids = torch.arange(0, num_classes, device=x.device)
    permuted_ids = torch.randperm(num_classes, device=x.device).type(x.type())
    one_hot = x.unsqueeze(-1) == class_ids
    return (one_hot * permuted_ids).sum(-1)
|
123 |
+
|
124 |
+
|
125 |
+
class CategoricalActivation(nn.Module):
    """Activation that randomly discretizes a subset of hidden dimensions.

    Each hidden dimension is independently chosen (with prob ``categorical_p``)
    to be turned into a categorical feature: its activations are bucketed by
    thresholds sampled from the activations themselves. A further subset
    (``ordered_p``) of those categorical dimensions then has its class labels
    randomly permuted, destroying the ordinal structure.
    """
    def __init__(self, categorical_p=0.1, ordered_p=0.7
                 , keep_activation_size=False
                 , num_classes_sampler=zipf_sampler_f(0.8, 1, 10)):
        # NOTE: the default num_classes_sampler is built once at class-definition
        # time and shared by all instances (it is a sampler, so this is intended).
        self.categorical_p = categorical_p
        self.ordered_p = ordered_p
        self.keep_activation_size = keep_activation_size
        self.num_classes_sampler = num_classes_sampler

        super().__init__()

    def forward(self, x):
        # x shape: T, B, H

        x = nn.Softsign()(x)

        num_classes = self.num_classes_sampler()
        # Remember per-(batch, hidden) mean magnitude so it can be restored after bucketing.
        hid_strength = torch.abs(x).mean(0).unsqueeze(0) if self.keep_activation_size else None

        # Boolean mask of (B, H) positions to discretize.
        categorical_classes = torch.rand((x.shape[1], x.shape[2])) < self.categorical_p
        class_boundaries = torch.zeros((num_classes - 1, x.shape[1], x.shape[2]), device=x.device, dtype=x.dtype)
        # Sample a different index for each hidden dimension, but shared for all batches
        for b in range(x.shape[1]):
            for h in range(x.shape[2]):
                # Thresholds are actual activation values sampled along the time axis.
                ind = torch.randint(0, x.shape[0], (num_classes - 1,))
                class_boundaries[:, b, h] = x[ind, b, h]

        for b in range(x.shape[1]):
            x_rel = x[:, b, categorical_classes[b]]
            boundaries_rel = class_boundaries[:, b, categorical_classes[b]].unsqueeze(1)
            # Count of thresholds exceeded = class index; centered around zero.
            x[:, b, categorical_classes[b]] = (x_rel > boundaries_rel).sum(dim=0).float() - num_classes / 2

        # NOTE(review): despite the name, the dimensions selected here are the
        # ones whose labels get *permuted* (i.e. made unordered) — confirm.
        ordered_classes = torch.rand((x.shape[1],x.shape[2])) < self.ordered_p
        ordered_classes = torch.logical_and(ordered_classes, categorical_classes)
        x[:, ordered_classes] = randomize_classes(x[:, ordered_classes], num_classes)

        x = x * hid_strength if self.keep_activation_size else x

        return x
|
TabPFN/requirements.txt
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Please use python V 3.7 to be compatible with all packages
|
2 |
+
gpytorch==1.5.0
|
3 |
+
torch==1.9.0
|
4 |
+
scikit-learn==0.24.2
|
5 |
+
pyyaml==5.4.1
|
6 |
+
seaborn==0.11.2
|
7 |
+
xgboost==1.4.0
|
8 |
+
tqdm==4.62.1
|
9 |
+
numpy==1.21.2
|
10 |
+
openml==0.12.2
|
11 |
+
catboost==0.26.1
|
12 |
+
auto-sklearn==0.14.5
|
13 |
+
hyperopt==0.2.5
|
14 |
+
configspace==0.4.21
|
15 |
+
# autogluon==0.4.0
|
TabPFN/scripts/__pycache__/tabular_baselines.cpython-39.pyc
ADDED
Binary file (11.3 kB). View file
|
|