Predictor exploration, subset selection, and determining models

2023-02-27 15:33:04 -06:00 · 2023-02-27 15:33:04 -06:00 · 0f2d8787ca
commit 0f2d8787ca
parent b5e5908f6e
1 changed files with 306 additions and 10 deletions
--- a/Schrick-Noah_Learning-Practice-5.ipynb
+++ b/Schrick-Noah_Learning-Practice-5.ipynb
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
@ -19,7 +19,10 @@
    "from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge\n",
    "import statsmodels.formula.api as sm\n",
    "import matplotlib.pylab as plt\n",
-    "import seaborn as sns\n"
+    "import seaborn as sns\n",
+    "from dmba import regressionSummary, exhaustive_search\n",
+    "from dmba import backward_elimination, forward_selection, stepwise_selection\n",
+    "from dmba import adjusted_r2_score, AIC_score, BIC_score\n"
   ]
  },
  {
@ -74,7 +77,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
@ -85,7 +88,15 @@
      "  Predictor  coefficient\n",
      "0      CRIM    -0.240062\n",
      "1      CHAS     3.266817\n",
-      "2        RM     8.325175\n"
+      "2        RM     8.325175\n",
+      "\n",
+      "Regression statistics\n",
+      "\n",
+      "                      Mean Error (ME) : -0.0000\n",
+      "       Root Mean Squared Error (RMSE) : 5.9666\n",
+      "            Mean Absolute Error (MAE) : 3.9668\n",
+      "          Mean Percentage Error (MPE) : -7.2747\n",
+      "Mean Absolute Percentage Error (MAPE) : 22.5927\n"
     ]
    }
   ],
@ -108,6 +119,9 @@
    "print('intercept ', housing_lm.intercept_)\n",
    "print(pd.DataFrame({'Predictor': X.columns, 'coefficient': housing_lm.coef_}))\n",
    "\n",
+    "# print performance measures\n",
+    "regressionSummary(train_y, housing_lm.predict(train_X))\n",
+    "\n",
    "# Equation:\n",
    "# MEDV = -29.19 -0.24*CRIM + 3.27*CHAS + 8.33*RM\n"
   ]
@ -190,22 +204,304 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 26,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<AxesSubplot: >"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "# d.\n",
-    "# ii."
+    "# ii.\n",
+    "# Create the figure in this cell so the heatmap is rendered with this cell's output\n",
+    "# instead of being drawn onto an `ax` left over from an earlier cell.\n",
+    "fig, ax = plt.subplots()\n",
+    "# Trailing ';' keeps the Axes repr out of the cell output.\n",
+    "sns.heatmap(corr, annot=True, fmt=\".1f\", cmap=\"RdBu\", center=0, ax=ax);"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Highly Correlated Pairs\n",
+    "- ZN and DIS\n",
+    "- RAD and TAX\n",
+    "- PTRATIO and RAD\n",
+    "- PTRATIO and TAX"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 42,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Backward\n",
+      "Variables: CRIM, CHAS, RM\n",
+      "Start: score=1952.30\n",
+      "Step: score=1952.30, remove None\n",
+      "['CRIM', 'CHAS', 'RM']\n",
+      "\n",
+      "Regression statistics\n",
+      "\n",
+      "                      Mean Error (ME) : -0.0000\n",
+      "       Root Mean Squared Error (RMSE) : 5.9666\n",
+      "            Mean Absolute Error (MAE) : 3.9668\n",
+      "          Mean Percentage Error (MPE) : -7.2747\n",
+      "Mean Absolute Percentage Error (MAPE) : 22.5927\n",
+      "\n"
+     ]
+    }
+   ],
   "source": [
    "# d.\n",
-    "# iii."
+    "# iii.\n",
+    "def train_model(variables):\n",
+    "    \"\"\"Fit a LinearRegression on the given columns of train_X.\n",
+    "\n",
+    "    Returns None when `variables` is empty (no predictors to fit).\n",
+    "    \"\"\"\n",
+    "    if len(variables) == 0:\n",
+    "        return None\n",
+    "    model = LinearRegression()\n",
+    "    model.fit(train_X[variables], train_y)\n",
+    "    return model\n",
+    "\n",
+    "def score_model(model, variables):\n",
+    "    \"\"\"Return the AIC score of `model` on the training data.\n",
+    "\n",
+    "    With no variables, the score is computed for predicting the mean of train_y.\n",
+    "    \"\"\"\n",
+    "    if len(variables) == 0:\n",
+    "        return AIC_score(train_y, [train_y.mean()] * len(train_y), model, df=1)\n",
+    "    return AIC_score(train_y, model.predict(train_X[variables]), model)\n",
+    "\n",
+    "print(\"Backward\")\n",
+    "best_back_model, best_back_variables = backward_elimination(train_X.columns, train_model, score_model, verbose=True)\n",
+    "print(best_back_variables)\n",
+    "# Predict with exactly the columns the selected model was fitted on; passing all of\n",
+    "# train_X is only correct when backward elimination removes nothing.\n",
+    "regressionSummary(train_y, best_back_model.predict(train_X[best_back_variables]))\n",
+    "print()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Forward\n",
+      "Variables: CRIM, CHAS, RM\n",
+      "Start: score=2191.75, constant\n",
+      "Step: score=1989.28, add RM\n",
+      "Step: score=1956.79, add CRIM\n",
+      "Step: score=1952.30, add CHAS\n",
+      "Step: score=1952.30, add None\n",
+      "['RM', 'CRIM', 'CHAS']\n",
+      "\n",
+      "Regression statistics\n",
+      "\n",
+      "                      Mean Error (ME) : -0.0000\n",
+      "       Root Mean Squared Error (RMSE) : 5.9666\n",
+      "            Mean Absolute Error (MAE) : 3.9668\n",
+      "          Mean Percentage Error (MPE) : -7.2747\n",
+      "Mean Absolute Percentage Error (MAPE) : 22.5927\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Forward\")\n",
+    "best_forw_model, best_forw_variables = forward_selection(train_X.columns, train_model, score_model, verbose=True)\n",
+    "print(best_forw_variables)\n",
+    "# Use the variables (and order) returned by the search rather than a hard-coded list,\n",
+    "# so the prediction input always matches what best_forw_model was fitted on.\n",
+    "forw_train_X = train_X.loc[:, best_forw_variables]\n",
+    "regressionSummary(train_y, best_forw_model.predict(forw_train_X))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Stepwise\n",
+      "Variables: CRIM, CHAS, RM\n",
+      "Start: score=2191.75, constant\n",
+      "Step: score=1989.28, add RM\n",
+      "Step: score=1956.79, add CRIM\n",
+      "Step: score=1952.30, add CHAS\n",
+      "Step: score=1952.30, add None\n",
+      "['RM', 'CRIM', 'CHAS']\n",
+      "\n",
+      "Regression statistics\n",
+      "\n",
+      "                      Mean Error (ME) : -0.0000\n",
+      "       Root Mean Squared Error (RMSE) : 5.9666\n",
+      "            Mean Absolute Error (MAE) : 3.9668\n",
+      "          Mean Percentage Error (MPE) : -7.2747\n",
+      "Mean Absolute Percentage Error (MAPE) : 22.5927\n",
+      "\n",
+      "Regression statistics\n",
+      "\n",
+      "                      Mean Error (ME) : -0.0000\n",
+      "       Root Mean Squared Error (RMSE) : 5.9666\n",
+      "            Mean Absolute Error (MAE) : 3.9668\n",
+      "          Mean Percentage Error (MPE) : -7.2747\n",
+      "Mean Absolute Percentage Error (MAPE) : 22.5927\n"
+     ]
+    }
+   ],
+   "source": [
+    "# d iii. continued\n",
+    "print(\"Stepwise\")\n",
+    "# Stepwise search (imported from dmba above); forward_selection is already run in its own cell.\n",
+    "best_step_model, best_step_variables = stepwise_selection(train_X.columns, train_model, score_model, verbose=True)\n",
+    "print(best_step_variables)\n",
+    "# Select the columns returned by the search so the input matches the fitted model.\n",
+    "step_train_X = train_X.loc[:, best_step_variables]\n",
+    "# regressionSummary prints its report, so a single call is enough.\n",
+    "regressionSummary(train_y, best_step_model.predict(step_train_X))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "LASSO\n",
+      "\n",
+      "Regression statistics\n",
+      "\n",
+      "                      Mean Error (ME) : 0.2627\n",
+      "       Root Mean Squared Error (RMSE) : 6.7153\n",
+      "            Mean Absolute Error (MAE) : 4.7355\n",
+      "          Mean Percentage Error (MPE) : -8.5983\n",
+      "Mean Absolute Percentage Error (MAPE) : 23.9824\n",
+      "\n",
+      "\n",
+      "LASSO CV\n",
+      "\n",
+      "Regression statistics\n",
+      "\n",
+      "                      Mean Error (ME) : 0.1124\n",
+      "       Root Mean Squared Error (RMSE) : 6.4186\n",
+      "            Mean Absolute Error (MAE) : 4.4592\n",
+      "          Mean Percentage Error (MPE) : -7.7091\n",
+      "Mean Absolute Percentage Error (MAPE) : 23.1854\n",
+      "Lasso-CV chosen regularization:  0.033515828458353755\n",
+      "[-0.24201538  2.81692528  8.25934245]\n",
+      "\n",
+      "\n",
+      "RIDGE\n",
+      "\n",
+      "Regression statistics\n",
+      "\n",
+      "                      Mean Error (ME) : 0.1201\n",
+      "       Root Mean Squared Error (RMSE) : 6.4138\n",
+      "            Mean Absolute Error (MAE) : 4.4590\n",
+      "          Mean Percentage Error (MPE) : -7.6484\n",
+      "Mean Absolute Percentage Error (MAPE) : 23.1724\n",
+      "\n",
+      "\n",
+      "BAYESIAN RIDGE\n",
+      "\n",
+      "Regression statistics\n",
+      "\n",
+      "                      Mean Error (ME) : 0.1211\n",
+      "       Root Mean Squared Error (RMSE) : 6.4144\n",
+      "            Mean Absolute Error (MAE) : 4.4603\n",
+      "          Mean Percentage Error (MPE) : -7.6595\n",
+      "Mean Absolute Percentage Error (MAPE) : 23.1747\n",
+      "Bayesian ridge chosen regularization:  1.3591395967339095\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# d iii model\n",
+    "# Lasso with a fixed penalty, fitted on the training split and scored on validation.\n",
+    "print(\"LASSO\")\n",
+    "lasso = Lasso(alpha=1).fit(train_X, train_y)\n",
+    "regressionSummary(valid_y, lasso.predict(valid_X))\n",
+    "print(\"\\n\")\n",
+    "\n",
+    "# Lasso with the penalty picked by 5-fold cross-validation on the training split.\n",
+    "print(\"LASSO CV\")\n",
+    "lasso_cv = LassoCV(cv=5).fit(train_X, train_y)\n",
+    "regressionSummary(valid_y, lasso_cv.predict(valid_X))\n",
+    "print('Lasso-CV chosen regularization: ', lasso_cv.alpha_)\n",
+    "print(lasso_cv.coef_)\n",
+    "print(\"\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "RIDGE\n",
+      "\n",
+      "Regression statistics\n",
+      "\n",
+      "                      Mean Error (ME) : 0.1201\n",
+      "       Root Mean Squared Error (RMSE) : 6.4138\n",
+      "            Mean Absolute Error (MAE) : 4.4590\n",
+      "          Mean Percentage Error (MPE) : -7.6484\n",
+      "Mean Absolute Percentage Error (MAPE) : 23.1724\n",
+      "\n",
+      "\n",
+      "BAYESIAN RIDGE\n",
+      "\n",
+      "Regression statistics\n",
+      "\n",
+      "                      Mean Error (ME) : 0.1211\n",
+      "       Root Mean Squared Error (RMSE) : 6.4144\n",
+      "            Mean Absolute Error (MAE) : 4.4603\n",
+      "          Mean Percentage Error (MPE) : -7.6595\n",
+      "Mean Absolute Percentage Error (MAPE) : 23.1747\n",
+      "Bayesian ridge chosen regularization:  1.3591395967339095\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Ridge with a fixed penalty, fitted on the training split and scored on validation.\n",
+    "print(\"RIDGE\")\n",
+    "ridge = Ridge(alpha=1).fit(train_X, train_y)\n",
+    "regressionSummary(valid_y, ridge.predict(valid_X))\n",
+    "print(\"\\n\")\n",
+    "\n",
+    "# Bayesian ridge; the reported regularization is the ratio lambda_ / alpha_ of the fitted model.\n",
+    "print(\"BAYESIAN RIDGE\")\n",
+    "bayesianRidge = BayesianRidge().fit(train_X, train_y)\n",
+    "regressionSummary(valid_y, bayesianRidge.predict(valid_X))\n",
+    "print('Bayesian ridge chosen regularization: ', bayesianRidge.lambda_ / bayesianRidge.alpha_)\n",
+    "print(\"\\n\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
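+    "# Best model\n",
+    "- Ridge: lowest RMSE (6.4138), MAE (4.4590), and MAPE (23.1724) on the validation set\n",
+    "- Bayesian Ridge: a close second on all three metrics (RMSE 6.4144, MAE 4.4603, MAPE 23.1747)\n",
+    "\n",
+    "Ridge or Bayesian Ridge should be used. Further parameter tuning can assist in selecting which of the two models to use."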
   ]
  },
  {