diff --git a/Schrick-Noah_Learning-Practice-5.ipynb b/Schrick-Noah_Learning-Practice-5.ipynb
index f62af62..7a1f744 100644
--- a/Schrick-Noah_Learning-Practice-5.ipynb
+++ b/Schrick-Noah_Learning-Practice-5.ipynb
@@ -2,7 +2,7 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 24,
+ "execution_count": 81,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -19,7 +19,10 @@
  "from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge\n",
  "import statsmodels.formula.api as sm\n",
  "import matplotlib.pylab as plt\n",
- "import seaborn as sns\n"
+ "import seaborn as sns\n",
+ "from dmba import regressionSummary, exhaustive_search\n",
+ "from dmba import backward_elimination, forward_selection, stepwise_selection\n",
+ "from dmba import adjusted_r2_score, AIC_score, BIC_score\n"
  ]
  },
  {
@@ -74,7 +77,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 19,
+ "execution_count": 31,
  "metadata": {},
  "outputs": [
  {
@@ -85,7 +88,15 @@
  " Predictor coefficient\n",
  "0 CRIM -0.240062\n",
  "1 CHAS 3.266817\n",
- "2 RM 8.325175\n"
+ "2 RM 8.325175\n",
+ "\n",
+ "Regression statistics\n",
+ "\n",
+ " Mean Error (ME) : -0.0000\n",
+ " Root Mean Squared Error (RMSE) : 5.9666\n",
+ " Mean Absolute Error (MAE) : 3.9668\n",
+ " Mean Percentage Error (MPE) : -7.2747\n",
+ "Mean Absolute Percentage Error (MAPE) : 22.5927\n"
  ]
  }
  ],
@@ -108,6 +119,9 @@
  "print('intercept ', housing_lm.intercept_)\n",
  "print(pd.DataFrame({'Predictor': X.columns, 'coefficient': housing_lm.coef_}))\n",
  "\n",
+ "# print performance measures\n",
+ "regressionSummary(train_y, housing_lm.predict(train_X))\n",
+ "\n",
  "# Equation:\n",
  "# MEDV = -29.19 -0.24*CRIM + 3.27*CHAS + 8.33*RM\n"
  ]
  },
  {
@@ -190,22 +204,304 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 26,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
  "source": [
  "# d.\n",
- "# ii."
+ "# ii.\n",
+ "sns.heatmap(corr, annot=True, fmt=\".1f\", cmap=\"RdBu\", center=0, ax=ax)"
  ]
  },
  {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Highly Correlated Pairs\n",
+ "- ZN and DIS\n",
+ "- RAD and TAX\n",
+ "- PTRATIO and RAD\n",
+ "- PTRATIO and TAX"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 42,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Backward\n",
+ "Variables: CRIM, CHAS, RM\n",
+ "Start: score=1952.30\n",
+ "Step: score=1952.30, remove None\n",
+ "['CRIM', 'CHAS', 'RM']\n",
+ "\n",
+ "Regression statistics\n",
+ "\n",
+ " Mean Error (ME) : -0.0000\n",
+ " Root Mean Squared Error (RMSE) : 5.9666\n",
+ " Mean Absolute Error (MAE) : 3.9668\n",
+ " Mean Percentage Error (MPE) : -7.2747\n",
+ "Mean Absolute Percentage Error (MAPE) : 22.5927\n",
+ "\n"
+ ]
+ }
+ ],
  "source": [
  "# d.\n",
- "# iii."
+ "# iii.\n",
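+ "# Helper functions for the dmba selection routines below: train_model fits a\n",
+ "# LinearRegression on a candidate subset of predictors, and score_model returns\n",
+ "# its AIC on the training data (an empty subset is scored as predicting the training mean).\n",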
+ "# iii.\n", + "def train_model(variables):\n", + " if len(variables) == 0:\n", + " return None\n", + " model = LinearRegression()\n", + " model.fit(train_X[variables], train_y)\n", + " return model\n", + "\n", + "def score_model(model, variables):\n", + " if len(variables) == 0:\n", + " return AIC_score(train_y, [train_y.mean()] * len(train_y), model, df=1)\n", + " return AIC_score(train_y, model.predict(train_X[variables]), model)\n", + "\n", + "print(\"Backward\")\n", + "best_back_model, best_back_variables = backward_elimination(train_X.columns, train_model, score_model, verbose=True)\n", + "print(best_back_variables)\n", + "regressionSummary(train_y, best_back_model.predict(train_X))\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Forward\n", + "Variables: CRIM, CHAS, RM\n", + "Start: score=2191.75, constant\n", + "Step: score=1989.28, add RM\n", + "Step: score=1956.79, add CRIM\n", + "Step: score=1952.30, add CHAS\n", + "Step: score=1952.30, add None\n", + "['RM', 'CRIM', 'CHAS']\n", + "\n", + "Regression statistics\n", + "\n", + " Mean Error (ME) : -0.0000\n", + " Root Mean Squared Error (RMSE) : 5.9666\n", + " Mean Absolute Error (MAE) : 3.9668\n", + " Mean Percentage Error (MPE) : -7.2747\n", + "Mean Absolute Percentage Error (MAPE) : 22.5927\n" + ] + } + ], + "source": [ + "print(\"Forward\")\n", + "best_forw_model, best_forw_variables = forward_selection(train_X.columns, train_model, score_model, verbose=True)\n", + "print(best_forw_variables)\n", + "forw_train_X = train_X.loc[:,['RM','CRIM','CHAS']]\n", + "regressionSummary(train_y, best_forw_model.predict(forw_train_X))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Stepwise\n", + "Variables: CRIM, CHAS, RM\n", + "Start: score=2191.75, constant\n", + "Step: score=1989.28, add RM\n", + "Step: score=1956.79, add CRIM\n", + "Step: score=1952.30, add CHAS\n", + "Step: score=1952.30, add None\n", + "['RM', 'CRIM', 'CHAS']\n", + "\n", + "Regression statistics\n", + "\n", + " Mean Error (ME) : -0.0000\n", + " Root Mean Squared Error (RMSE) : 5.9666\n", + " Mean Absolute Error (MAE) : 3.9668\n", + " Mean Percentage Error (MPE) : -7.2747\n", + "Mean Absolute Percentage Error (MAPE) : 22.5927\n", + "\n", + "Regression statistics\n", + "\n", + " Mean Error (ME) : -0.0000\n", + " Root Mean Squared Error (RMSE) : 5.9666\n", + " Mean Absolute Error (MAE) : 3.9668\n", + " Mean Percentage Error (MPE) : -7.2747\n", + "Mean Absolute Percentage Error (MAPE) : 22.5927\n" + ] + } + ], + "source": [ + "# d iii. 
continued\n", + "print(\"Stepwise\")\n", + "best_step_model, best_step_variables = forward_selection(train_X.columns, train_model, score_model, verbose=True)\n", + "print(best_step_variables)\n", + "step_train_X = train_X.loc[:,['RM','CRIM','CHAS']]\n", + "regressionSummary(train_y, best_step_model.predict(step_train_X))\n", + "test=regressionSummary(train_y, best_step_model.predict(step_train_X))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LASSO\n", + "\n", + "Regression statistics\n", + "\n", + " Mean Error (ME) : 0.2627\n", + " Root Mean Squared Error (RMSE) : 6.7153\n", + " Mean Absolute Error (MAE) : 4.7355\n", + " Mean Percentage Error (MPE) : -8.5983\n", + "Mean Absolute Percentage Error (MAPE) : 23.9824\n", + "\n", + "\n", + "LASSO CV\n", + "\n", + "Regression statistics\n", + "\n", + " Mean Error (ME) : 0.1124\n", + " Root Mean Squared Error (RMSE) : 6.4186\n", + " Mean Absolute Error (MAE) : 4.4592\n", + " Mean Percentage Error (MPE) : -7.7091\n", + "Mean Absolute Percentage Error (MAPE) : 23.1854\n", + "Lasso-CV chosen regularization: 0.033515828458353755\n", + "[-0.24201538 2.81692528 8.25934245]\n", + "\n", + "\n", + "RIDGE\n", + "\n", + "Regression statistics\n", + "\n", + " Mean Error (ME) : 0.1201\n", + " Root Mean Squared Error (RMSE) : 6.4138\n", + " Mean Absolute Error (MAE) : 4.4590\n", + " Mean Percentage Error (MPE) : -7.6484\n", + "Mean Absolute Percentage Error (MAPE) : 23.1724\n", + "\n", + "\n", + "BAYESIAN RIDGE\n", + "\n", + "Regression statistics\n", + "\n", + " Mean Error (ME) : 0.1211\n", + " Root Mean Squared Error (RMSE) : 6.4144\n", + " Mean Absolute Error (MAE) : 4.4603\n", + " Mean Percentage Error (MPE) : -7.6595\n", + "Mean Absolute Percentage Error (MAPE) : 23.1747\n", + "Bayesian ridge chosen regularization: 1.3591395967339095\n", + "\n", + "\n" + ] + } + ], + "source": [ + "# d iii model\n", + "print(\"LASSO\")\n", + "lasso = Lasso(alpha=1)\n", + "lasso.fit(train_X, train_y)\n", + "regressionSummary(valid_y, lasso.predict(valid_X))\n", + "print(\"\\n\")\n", + "\n", + "print(\"LASSO CV\")\n", + "lasso_cv = LassoCV(cv=5)\n", + "lasso_cv.fit(train_X, train_y)\n", + "regressionSummary(valid_y, lasso_cv.predict(valid_X))\n", + "print('Lasso-CV chosen regularization: ', lasso_cv.alpha_)\n", + "print(lasso_cv.coef_)\n", + "print(\"\\n\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RIDGE\n", + "\n", + "Regression statistics\n", + "\n", + " Mean Error (ME) : 0.1201\n", + " Root Mean Squared Error (RMSE) : 6.4138\n", + " Mean Absolute Error (MAE) : 4.4590\n", + " Mean Percentage Error (MPE) : -7.6484\n", + "Mean Absolute Percentage Error (MAPE) : 23.1724\n", + "\n", + "\n", + "BAYESIAN RIDGE\n", + "\n", + "Regression statistics\n", + "\n", + " Mean Error (ME) : 0.1211\n", + " Root Mean Squared Error (RMSE) : 6.4144\n", + " Mean Absolute Error (MAE) : 4.4603\n", + " Mean Percentage Error (MPE) : -7.6595\n", + "Mean Absolute Percentage Error (MAPE) : 23.1747\n", + "Bayesian ridge chosen regularization: 1.3591395967339095\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(\"RIDGE\")\n", + "ridge = Ridge(alpha=1)\n", + "ridge.fit(train_X, train_y)\n", + "regressionSummary(valid_y, ridge.predict(valid_X))\n", + "print(\"\\n\")\n", + "\n", + "print(\"BAYESIAN RIDGE\")\n", + "bayesianRidge = BayesianRidge()\n", + 
"bayesianRidge.fit(train_X, train_y)\n", + "regressionSummary(valid_y, bayesianRidge.predict(valid_X))\n", + "print('Bayesian ridge chosen regularization: ', bayesianRidge.lambda_ / bayesianRidge.alpha_)\n", + "print(\"\\n\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Best model\n", + "Bayesian Ridge: Lowest MAPE\n", + "Ridge: Lowest RMSE, lowest MAE\n", + "\n", + "Ridge or Bayesian Ridge should be used. Further parameter tuning can assist in selection which of the two models to use." ] }, {