{ "cells": [ { "cell_type": "markdown", "id": "3c142580", "metadata": {}, "source": [ "
\n", "

Titanic Dataset

\n", "

Make final model and use it in Streamlit

\n", "

Soheil Tehranipour

\n", "
" ] }, { "cell_type": "code", "execution_count": null, "id": "405e33d5", "metadata": { "tags": [] }, "outputs": [], "source": [ "pip install xgboost" ] }, { "cell_type": "code", "execution_count": 2, "id": "97f5a509", "metadata": {}, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'xgboost'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[2], line 9\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompose\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ColumnTransformer , make_column_transformer\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbase\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseEstimator, TransformerMixin\n\u001b[1;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mxgboost\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m XGBClassifier\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[0;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m confusion_matrix, precision_score, recall_score\n", "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'xgboost'" ] } ], "source": [ "import pandas as pd\n", "import re\n", "import numpy as np\n", "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", "from sklearn.impute import SimpleImputer\n", "from 
sklearn.pipeline import Pipeline\n", "from sklearn.compose import ColumnTransformer, make_column_transformer\n", "from sklearn.base import BaseEstimator, TransformerMixin\n", "from xgboost import XGBClassifier\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import confusion_matrix, precision_score, recall_score\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] },
{ "cell_type": "code", "execution_count": null, "id": "c73e3ee5", "metadata": {}, "outputs": [], "source": [
 "# Load the training data and separate the features from the Survived target.\n",
 "df = pd.read_csv('train.csv')\n",
 "X = df.drop(columns='Survived')\n",
 "y = df['Survived']"
] },
{ "cell_type": "code", "execution_count": null, "id": "ea854483", "metadata": {}, "outputs": [], "source": [
 "df.head()"
] },
{ "cell_type": "code", "execution_count": null, "id": "e82c900f", "metadata": {}, "outputs": [], "source": [
 "class PrepProcesor(BaseEstimator, TransformerMixin):\n",
 "    \"\"\"Titanic-specific cleaning step applied ahead of the ColumnTransformer.\n",
 "\n",
 "    BaseEstimator supplies get_params/set_params and TransformerMixin supplies\n",
 "    fit_transform, so the class can be used as a scikit-learn Pipeline step.\n",
 "    \"\"\"\n",
 "\n",
 "    def fit(self, X, y=None):\n",
 "        \"\"\"Fit the Age imputer (SimpleImputer defaults) on X['Age'] and return self.\"\"\"\n",
 "        self.ageImputer = SimpleImputer()\n",
 "        self.ageImputer.fit(X[['Age']])\n",
 "        return self\n",
 "\n",
 "    def transform(self, X, y=None):\n",
 "        \"\"\"Return a cleaned copy of X; the caller's DataFrame is not modified.\n",
 "\n",
 "        - Age: missing values filled by the imputer learned in fit.\n",
 "        - CabinClass: the letters of Cabin ('M' when Cabin is missing).\n",
 "        - CabinNumber: the digits of Cabin as an integer (0 when there are none).\n",
 "        - Embarked: missing values replaced by 'M'.\n",
 "        - PassengerId, Name, Ticket and Cabin are dropped.\n",
 "        \"\"\"\n",
 "        # Copy first: assigning columns on the incoming frame would silently alter\n",
 "        # whatever DataFrame the caller passed to fit / predict.\n",
 "        X = X.copy()\n",
 "        X['Age'] = self.ageImputer.transform(X[['Age']])\n",
 "\n",
 "        cabin = X['Cabin'].fillna('M').astype(str)\n",
 "        X['CabinClass'] = cabin.str.replace(r'[^a-zA-Z]', '', regex=True)\n",
 "        # Multi-cabin values such as 'C23 C25' collapse to a single number (2325).\n",
 "        # to_numeric gives a real integer column instead of digit strings mixed with int 0.\n",
 "        cabin_digits = cabin.str.replace(r'[^0-9]', '', regex=True).replace('', '0')\n",
 "        X['CabinNumber'] = pd.to_numeric(cabin_digits)\n",
 "\n",
 "        X['Embarked'] = X['Embarked'].fillna('M')\n",
 "        return X.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])"
] },
{ "cell_type": "code", "execution_count": null, "id": "6c89feab", "metadata": {}, "outputs": [], "source": [
 "preproc = PrepProcesor()\n",
 "numeric_pipeline = Pipeline([('Scaler', StandardScaler())])\n",
 "categorical_pipeline = 
Pipeline([('OneHot', OneHotEncoder(handle_unknown='ignore'))])\n",
 "# Numeric columns are standardised, categorical columns are one-hot encoded.\n",
 "numeric_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'CabinNumber']\n",
 "categorical_features = ['Sex', 'Embarked', 'CabinClass']\n",
 "transformer = ColumnTransformer([\n",
 "    ('num', numeric_pipeline, numeric_features),\n",
 "    ('cat', categorical_pipeline, categorical_features),\n",
 "])"
] },
{ "cell_type": "code", "execution_count": null, "id": "61d620be", "metadata": {}, "outputs": [], "source": [
 "# Hold out 10% of the rows for evaluation; the fixed random_state keeps the split reproducible.\n",
 "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1234)"
] },
{ "cell_type": "code", "execution_count": null, "id": "6a10b0a7", "metadata": {}, "outputs": [], "source": [
 "# Preview only; avoid dumping the whole frame into the notebook output.\n",
 "X_train.head()"
] },
{ "cell_type": "code", "execution_count": null, "id": "db5de807", "metadata": {}, "outputs": [], "source": [
 "# Full pipeline: custom cleaning -> scaling / one-hot encoding -> XGBoost classifier.\n",
 "mlpipe = Pipeline([\n",
 "    ('InitialPreproc', PrepProcesor()),\n",
 "    ('Transformer', transformer),\n",
 "    ('xgb', XGBClassifier()),\n",
 "])"
] },
{ "cell_type": "code", "execution_count": null, "id": "ec852a78", "metadata": { "tags": [] }, "outputs": [], "source": [
 "mlpipe.fit(X_train, y_train)"
] },
{ "cell_type": "code", "execution_count": null, "id": "b3c048d1", "metadata": {}, "outputs": [], "source": [
 "# Predictions on the held-out split.\n",
 "y_hat = mlpipe.predict(X_test)"
] },
{ "cell_type": "code", "execution_count": null, "id": "cc40818e", "metadata": {}, "outputs": [], "source": [
 "y_hat.shape"
] },
{ "cell_type": "code", "execution_count": null, "id": "9dee0c0e", "metadata": {}, "outputs": [], "source": [
 "y_test.head()"
] },
{ "cell_type": "code", "execution_count": null, "id": "a914cc2e", "metadata": {}, "outputs": [], "source": [
 "precision_score(y_test, y_hat)"
] },
{ "cell_type": "code", "execution_count": null, "id": "cf0270c0", "metadata": {}, "outputs": [], "source": [
 "import joblib"
] },
{ "cell_type": "code", "execution_count": null, "id": "cd7011a1", "metadata": {}, "outputs": [], "source": [
 "# Persist the whole fitted pipeline (cleaning + encoding + model) as a single artifact.\n",
 "joblib.dump(mlpipe, 'xgbpipe.joblib')"
] },
{ "cell_type": "code", "execution_count": null, "id": "b3ea7174", "metadata": { "tags": [] }, "outputs": [], "source": [
 "# joblib files are pickles: only load artifacts you produced yourself.\n",
 "# Pickle stores PrepProcesor by reference, so the loading process (for example the\n",
 "# Streamlit app) must define or import that class before calling joblib.load.\n",
 "model = joblib.load('xgbpipe.joblib')"
] },
{ "cell_type": "code", "execution_count": null, "id": "60f0d045", "metadata": {}, "outputs": [], "source": [
 "# Unlabelled rows to score with the reloaded pipeline.\n",
 "test_df = pd.read_csv('test.csv')"
] },
{ "cell_type": "code", "execution_count": null, "id": "b46ac674", "metadata": {}, "outputs": [], "source": [
 "test_df.columns"
] },
{ "cell_type": "code", "execution_count": null, "id": "c2ce6aec", "metadata": { "scrolled": true, "tags": [] }, "outputs": [], "source": [
 "# The reloaded pipeline takes the raw frame directly; all cleaning happens inside it.\n",
 "y_hat_test = model.predict(test_df)\n",
 "y_hat_test"
] },
{ "cell_type": "markdown", "id": "92ee0e6a", "metadata": {}, "source": [ "" ] } ],
"metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 5 }