QM-7063-Learning-Practice-5/Lecture-Work.ipynb
2023-02-28 13:04:00 -06:00

777 lines
36 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Chapter 6: Multiple Linear Regression\n",
"\n",
"> (c) 2019 Galit Shmueli, Peter C. Bruce, Peter Gedeck \n",
">\n",
"> Code included in\n",
">\n",
"> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition) \n",
"> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel. 2019.\n",
"\n",
"## Import required packages"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge\n",
"import statsmodels.formula.api as sm\n",
"# pyplot (not the discouraged pylab shim) provides everything this notebook\n",
"# uses from plt (tight_layout, show)\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# The dmba helpers are required by nearly every cell below\n",
"# (regressionSummary, exhaustive_search, the stepwise-selection routines\n",
"# and the adjusted-R2/AIC/BIC scorers); with these imports commented out\n",
"# the very next cell fails with NameError on a fresh kernel.\n",
"from dmba import regressionSummary, exhaustive_search\n",
"from dmba import backward_elimination, forward_selection, stepwise_selection\n",
"from dmba import adjusted_r2_score, AIC_score, BIC_score\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Table 6.3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Reduce data frame to the top 1000 rows and select columns for regression analysis\n",
"car_df = pd.read_csv('ToyotaCorolla.csv')\n",
"car_df = car_df.iloc[0:1000]\n",
"\n",
"predictors = ['Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Met_Color', 'Automatic', 'CC', \n",
" 'Doors', 'Quarterly_Tax', 'Weight']\n",
"outcome = 'Price'\n",
"\n",
"# partition data\n",
"X = pd.get_dummies(car_df[predictors], drop_first=True)\n",
"y = car_df[outcome]\n",
"train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)\n",
"\n",
"car_lm = LinearRegression()\n",
"car_lm.fit(train_X, train_y)\n",
"\n",
"# print coefficients\n",
"print('intercept ', car_lm.intercept_)\n",
"print(pd.DataFrame({'Predictor': X.columns, 'coefficient': car_lm.coef_}))\n",
"\n",
"# print performance measures\n",
"regressionSummary(train_y, car_lm.predict(train_X))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"adjusted r2 : 0.8537958550253093\n",
"AIC : 10422.298278332171\n",
"BIC : 10479.45836384998\n"
]
}
],
"source": [
"pred_y = car_lm.predict(train_X)\n",
"\n",
"print('adjusted r2 : ', adjusted_r2_score(train_y, pred_y, car_lm))\n",
"print('AIC : ', AIC_score(train_y, pred_y, car_lm))\n",
"print('BIC : ', BIC_score(train_y, pred_y, car_lm))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Table 6.4"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Predicted Actual Residual\n",
"507 10607.333940 11500 892.666060\n",
"818 9272.705792 8950 -322.705792\n",
"452 10617.947808 11450 832.052192\n",
"368 13600.396275 11450 -2150.396275\n",
"242 12396.694660 11950 -446.694660\n",
"929 9496.498212 9995 498.501788\n",
"262 12480.063217 13500 1019.936783\n",
"810 8834.146068 7950 -884.146068\n",
"318 12183.361282 9900 -2283.361282\n",
"49 19206.965683 21950 2743.034317\n",
"446 10987.498309 11950 962.501691\n",
"142 18501.527375 19950 1448.472625\n",
"968 9914.690947 9950 35.309053\n",
"345 13827.299932 14950 1122.700068\n",
"971 7966.732543 10495 2528.267457\n",
"133 17185.242041 15950 -1235.242041\n",
"104 19952.658062 19450 -502.658062\n",
"6 16570.609280 16900 329.390720\n",
"600 13739.409113 11250 -2489.409113\n",
"496 11267.513740 11750 482.486260\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 103.6803\n",
" Root Mean Squared Error (RMSE) : 1312.8523\n",
" Mean Absolute Error (MAE) : 1017.5972\n",
" Mean Percentage Error (MPE) : -0.2633\n",
"Mean Absolute Percentage Error (MAPE) : 9.0111\n"
]
}
],
"source": [
"# Use predict() to make predictions on a new set\n",
"car_lm_pred = car_lm.predict(valid_X)\n",
"\n",
"result = pd.DataFrame({'Predicted': car_lm_pred, 'Actual': valid_y,\n",
" 'Residual': valid_y - car_lm_pred})\n",
"print(result.head(20))\n",
"\n",
"# Compute common accuracy measures\n",
"regressionSummary(valid_y, car_lm_pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Figure 6.1\n",
"Determine the residuals and create a histogram"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.7425\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAagAAAEYCAYAAAAJeGK1AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFFRJREFUeJzt3X+QXWV9x/H3l0QwEiTB4BoTNFCoFUz9wYp06LQbUImAwh/qYBknVDSdqh1a09Eg0zpOtQMihbHqYKqW0NEGRBEGxx9IWX/MCEhUjIhIhAgBTEQSZJVao9/+cZ/ITbi7e/fuvdln732/Zu7sOc/5cZ/77D372XPOc58bmYkkSbXZb6YrIElSKwaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlNQDEXFHRIyMs2wkIrZ26XlGI+LN3diXVJu5M10BaaZFxBZgCPgdMAZ8CXh7Zo51us/MPKY7tZMGl2dQUsOrM3M+8CLgxcB5M1wfaeAZUFKTzPwZ8GUaQUVEHBARH4yI+yJiW0RcFhHzyrJFEXF9ROyMiEci4hsRsV9ZtiUiXl6m50XE5RGxIyJ+CLy0+TkjIiPiyKb5yyPifWV6YXmOn5ftr4+Ipa3qHhFHRsTXIuLRiHg4Iq7sQRNJ+4wBJTUpf/xfBWwuRRcCf0wjsI4ElgD/XJatAbYCh9K4RPhuoNXYYe8B/qg8TgZWTaFK+wH/CTwXeA7wOPDhcdb9F+ArwEJgKfDvU3geqToGlNTw+Yh4DLgf2A68JyICeAvwD5n5SGY+BvwrcGbZ5rfAYuC5mfnbzPxGth7c8vXA+8s+7gc+1G6lMvMXmfnZzPx1ef73A385zuq/pRFkz87M/83Mb7b7PFKNDCip4YzMPAgYAf4EWETjzOhpwMZyGW8njQ4Uh5ZtLqJxpvWViLgnItaOs+9n0wi+3X7abqUi4mkR8bGI+GlE/BL4OrAgIua0WP2dQAC3ll6Eb2r3eaQaGVBSk8z8GnA58EHgYRqX1I7JzAXlcXDpTEFmPpaZazLzCODVwDsi4qQWu30IOKxp/jl7Lf81jSDc7VlN02uA5wEvy8ynA39RyqNF3X+WmW/JzGcDfwN8tPneljTbGFDSk10KvAL4U+A/gEsi4pkAEbEkIk4u06eVjgkB/JJGN/XftdjfVcB5pcPDUuDv9lr+PeCvImJORKxkz0t4B9EIyZ0RcQiN+1ktRcTrmjpQ7KBxP6xVfaRZwYCS9pKZPweuAP4JeBeNy3g3l0tsX6VxRgNwVJkfA74FfDQzR1vs8r00LuvdS6MTw3/ttfxcGmdgO4GzgM83LbsUmEfjbO5mGpcYx/NS4JaIGAOuA87NzHsnf8VSncIvLJQk1cgzKElSlQwoSVKVDChJUpXaGiy2DKb5GI0eQbsyc7j0KLoSWAZsAV6fmTt6U01J0qBpq5NECajhzHy4qewDwCOZeUH5gOLCzHzXRPtZtGhRLlu2bHo1nsCvfvUrDjzwwJ7tvx/ZZp2x3Tpju01dP7bZxo0bH87MQydbbzpft3E6jU/dA6wHRml0yR3XsmXLuO2226bxlBMbHR1lZGRk0vX0BNusM7ZbZ2y3qevHNouItkZTafcM6l6e+ODfxzJzXUTszMwFTevsyMyFLbZdDawGGBoaOnbDhg1tvoSpGxsbY/78+T3bfz+yzTpju3XGdpu6fmyzFStWbMzM4cnWa/cM6oTMfLB8mv6GiPhRuxXJzHXAOoDh4eHs5X8C/fifRq/ZZp2x3Tpju03dILdZW734MvPB8nM7cA1wHLAtIhYDlJ/be1VJSdLgmTSgIuLAiDho9zTwSuAHNIZS2f29NquAa3tVSUnS4GnnEt8QcE1jPEzmAp/OzC9FxLeBqyLiHOA+4HW9q6YkadBMGlCZeQ/wwhblvwBafbWAJEnT5kgSkqQqGVCSpCoZUJKkKhlQkqQqTWeoI2lWWLb2Cx1tt+WCU7
tcE0lT4RmUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUp+UFcax0Qf8F2zfBdnd/gB4In44WDpCZ5BSZKqZEBJkqpkQEmSqmRASZKqZEBJkqpkQEmSqmRASZKqZEBJkqpkQEmSqmRASZKqZEBJkqpkQEmSqmRASZKq5GjmUkUmGkF9Io6Crn7kGZQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKbQdURMyJiO9GxPVl/vCIuCUi7o6IKyNi/95VU5I0aKZyBnUucGfT/IXAJZl5FLADOKebFZMkDba2AioilgKnAh8v8wGcCFxdVlkPnNGLCkqSBlO7Z1CXAu8Efl/mnwHszMxdZX4rsKTLdZMkDbBJRzOPiNOA7Zm5MSJGdhe3WDXH2X41sBpgaGiI0dHRzmrahrGxsZ7uvx8NQputWb5r8pWmaGheb/bbqdnyOxyE91u3DXKbtfN1GycAr4mIU4CnAk+ncUa1ICLmlrOopcCDrTbOzHXAOoDh4eEcGRnpRr1bGh0dpZf770eD0GZnd/gVFhNZs3wXF2+q59tqtpw1MtNVaMsgvN+6bZDbbNJLfJl5XmYuzcxlwJnA/2TmWcBNwGvLaquAa3tWS0nSwJnO56DeBbwjIjbTuCf1ie5USZKkKX6jbmaOAqNl+h7guO5XSWqt02+blTQ7OZKEJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSnNnugKSZs6ytV/oaLstF5za5ZpIT+YZlCSpSpMGVEQ8NSJujYjbI+KOiHhvKT88Im6JiLsj4sqI2L/31ZUkDYp2zqB+A5yYmS8EXgSsjIjjgQuBSzLzKGAHcE7vqilJGjSTBlQ2jJXZp5RHAicCV5fy9cAZPamhJGkgRWZOvlLEHGAjcCTwEeAi4ObMPLIsPwz4Yma+oMW2q4HVAENDQ8du2LChe7Xfy9jYGPPnz+/Z/vvRbGqzTQ88OtNV+IOhebDt8ZmuxROWLzm4o+06bdNOn282vd9q0Y9ttmLFio2ZOTzZem314svM3wEviogFwDXA81utNs6264B1AMPDwzkyMtLOU3ZkdHSUXu6/H82mNju7wx5nvbBm+S4u3lRPJ9gtZ410tF2nbdrp882m91stBrnNptSLLzN3AqPA8cCCiNh9hC4FHuxu1SRJg6ydXnyHljMnImIe8HLgTuAm4LVltVXAtb2qpCRp8LRzjWIxsL7ch9oPuCozr4+IHwIbIuJ9wHeBT/SwnpKkATNpQGXm94EXtyi/BziuF5WSJMmRJCRJVTKgJElVMqAkSVWq54MckjrW6ajkUs08g5IkVcmAkiRVyYCSJFXJgJIkVcmAkiRVyYCSJFXJgJIkVcmAkiRVyQ/qap/zQ6WS2uEZlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUqTBlREHBYRN0XEnRFxR0ScW8oPiYgbIuLu8nNh76srSRoU7ZxB7QLWZObzgeOBt0XE0cBa4MbMPAq4scxLktQVkwZUZj6Umd8p048BdwJLgNOB9WW19cAZvaqkJGnwRGa2v3LEMuDrwAuA+zJzQdOyHZn5pMt8EbEaWA0wNDR07IYNG6ZZ5fGNjY0xf/78nu2/H81Em2164NF9+ny9MDQPtj0+07WYOc
uXHNzRdh6jU9ePbbZixYqNmTk82XptB1REzAe+Brw/Mz8XETvbCahmw8PDedttt7X1fJ0YHR1lZGSkZ/vvRzPRZsvWfmGfPl8vrFm+i4s3zZ3pasyYLRec2tF2HqNT149tFhFtBVRbvfgi4inAZ4FPZebnSvG2iFhcli8GtndaWUmS9tZOL74APgHcmZn/1rToOmBVmV4FXNv96kmSBlU71yhOAN4IbIqI75WydwMXAFdFxDnAfcDrelNFSdIgmjSgMvObQIyz+KTuVkeSpAZHkpAkVcmAkiRVyYCSJFXJgJIkVcmAkiRVyYCSJFXJgJIkVcmAkiRVaXBHu9S09cOgr5Lq5RmUJKlKBpQkqUoGlCSpSgaUJKlKdpKQNGWddpC5fOWBXa6J+plnUJKkKhlQkqQqGVCSpCoZUJKkKhlQkqQqGVCSpCoZUJKkKhlQkqQqGVCSpCoZUJKkKhlQkqQqGVCSpCoZUJKkKhlQkqQqGVCSpCoZUJKkKhlQkqQq+Y266vjbUSWplzyDkiRVyYCSJFXJgJIkVcmAkiRVyYCSJFVp0oCKiE9GxPaI+EFT2SERcUNE3F1+LuxtNSVJg6adM6jLgZV7la0FbszMo4Aby7wkSV0zaUBl5teBR/YqPh1YX6bXA2d0uV6SpAEXmTn5ShHLgOsz8wVlfmdmLmhaviMzW17mi4jVwGqAoaGhYzds2NCFarc2NjbG/Pnze7b/2m164NEpbzM0D7Y93oPK9DnbrTOHHzxnoI/RTvTj37UVK1ZszMzhydbr+UgSmbkOWAcwPDycIyMjPXuu0dFRern/2p3dwYgQa5bv4uJNDigyVbZbZy5feeBAH6OdGOS/a5324tsWEYsBys/t3auSJEmdB9R1wKoyvQq4tjvVkSSpoZ1u5v8NfAt4XkRsjYhzgAuAV0TE3cAryrwkSV0z6UX0zHzDOItO6nJdJPW5TQ882tG90i0XnNqD2qh2jiQhSaqSASVJqpIBJUmqkgElSaqSASVJqpIBJUmqkgElSaqSASVJqpIBJUmqkgElSaqSASVJqpIBJUmqkgElSaqSXwlaoWUdjPYsSf3GMyhJUpUMKElSlQwoSVKVDChJUpXsJCGpb3Xa4civmK+DZ1CSpCoZUJKkKhlQkqQqeQ+qh/zArdQdHkuDyTMoSVKVDChJUpUMKElSlQwoSVKVDChJUpUMKElSlQwoSVKVDChJUpUMKElSlWbdSBITfaJ8zfJdnD3BckcoltSOfh8Ffba8Ps+gJElVMqAkSVUyoCRJVZp196CmwxGRJfVSL/7GTHRvfbbc8+qUZ1CSpCpNK6AiYmVE3BURmyNibbcqJUlSxwEVEXOAjwCvAo4G3hARR3erYpKkwTadM6jjgM2ZeU9m/h+wATi9O9WSJA26yMzONox4LbAyM99c5t8IvCwz377XequB1WX2ecBdnVd3UouAh3u4/35km3XGduuM7TZ1/dhmz83MQydbaTq9+KJF2ZPSLjPXAeum8Txti4jbMnN4XzxXv7DNOmO7dcZ2m7pBbrPpXOLbChzWNL8UeHB61ZEkqWE6AfVt4KiIODwi9gfOBK7rTrUkSYOu40t8mbkrIt4OfBmYA3wyM+/oWs06s08uJfYZ26wztltnbLepG9g267iThCRJveRIEpKkKhlQkqQqzbqAioh/jIiMiEVlPiLiQ2W4pe9HxEua1l0VEXeXx6qm8mMjYlPZ5kMR0arLfF+IiIsi4kelba6JiAVNy84rbXBXRJzcVN5yCKvSIeaW0p5Xls4xA8XhvfYUEYdFxE0RcWdE3BER55byQyLihvJeuSEiFpbyKR+v/Soi5kTEdyPi+jLf8viKiAPK/OayfFnTPloew30jM2fNg0a39i8DPwUWlbJTgC/S+FzW8cAtpfwQ4J7yc2GZXliW3Qr8Wdnmi8CrZvq19bDNXgnMLdMXAheW6aOB24EDgMOBn9Do7DKnTB8B7F/WObpscxVwZpm+DPjbmX59+7gtx2
2bQX0Ai4GXlOmDgB+X99YHgLWlfG3T+27Kx2u/PoB3AJ8Gri/zLY8v4K3AZWX6TODKMt3yGJ7p19XNx2w7g7oEeCd7fiD4dOCKbLgZWBARi4GTgRsy85HM3AHcAKwsy56emd/Kxm/5CuCMffsy9p3M/Epm7iqzN9P4vBo02m1DZv4mM+8FNtMYvqrlEFblLPNE4Oqy/Xr6uN3G4fBee8nMhzLzO2X6MeBOYAmNdllfVmt+r0zpeN2HL2WfioilwKnAx8v8RMdXc1teDZxU1h/vGO4bsyagIuI1wAOZeftei5YA9zfNby1lE5VvbVE+CN5E479XmHq7PQPY2RR2g9Ruu43XNgLKpacXA7cAQ5n5EDRCDHhmWW2q77t+dSmNf7Z/X+YnOr7+0DZl+aNl/b5vs6q+sDAivgo8q8Wi84F307hc9aTNWpRlB+Wz1kTtlpnXlnXOB3YBn9q9WYv1k9b/tPRlu3XANhhHRMwHPgv8fWb+coLbugNzXI4nIk4DtmfmxogY2V3cYtWcZFnft1lVAZWZL29VHhHLaVxjvb288ZcC34mI4xh/yKWtwMhe5aOlfGmL9Wet8dptt3LD+TTgpHJZEyYeqqpV+cM0LsfMLf/Fzfp264DDe7UQEU+hEU6fyszPleJtEbE4Mx8ql/C2l/KpHq/96ATgNRFxCvBU4Ok0zqjGO752t9nWiJgLHAw8wiC8H2f6JlgnD2ALT3SSOJU9b7reWsoPAe6lccN1YZk+pCz7dll3dyeJU2b6NfWwrVYCPwQO3av8GPa8wXoPjU4Ac8v04TzREeCYss1n2PMm7ltn+vXt47Yct20G9VGOoSuAS/cqv4g9O0l8oExP+Xjt5weNUN7dSaLl8QW8jT07SVxVplsewzP9mrraPjNdgQ5/qc0BFTS+OPEnwCZguGm9N9G4cbgZ+Oum8mHgB2WbD1NG1OjHR3nt9wPfK4/LmpadX9rgLpp6MtLoafXjsuz8pvIjaPSA3FwOpgNm+vXNQHu2bJtBfQB/TuOy0veb3mOn0LhHciNwd/m5+5/DKR+v/fzYK6BaHl80zrI+U8pvBY5o2r7lMdwvD4c6kiRVadb04pMkDRYDSpJUJQNKklQlA0qSVCUDSpJUJQNKklQlA0qSVKX/Bwv3rkJVgddRAAAAAElFTkSuQmCC",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"car_lm_pred = car_lm.predict(valid_X)\n",
"all_residuals = valid_y - car_lm_pred\n",
"\n",
"# Determine the percentage of datapoints with a residual in [-1406, 1406] = approx. 75\\%\n",
"print(len(all_residuals[(all_residuals > -1406) & (all_residuals < 1406)]) / len(all_residuals))\n",
"\n",
"ax = pd.DataFrame({'Residuals': all_residuals}).hist(bins=25)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Table 6.5\n",
"Run an exhaustive search. The Fuel type column is categorical and needs to be converted into dummy variables."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" n r2adj AIC Age_08_04 Automatic CC Doors Fuel_Type_Diesel \\\n",
"0 1 0.767901 10689.712094 True False False False False \n",
"1 2 0.801160 10597.910645 True False False False False \n",
"2 3 0.829659 10506.084235 True False False False False \n",
"3 4 0.846357 10445.174820 True False False False False \n",
"4 5 0.849044 10435.578836 True False False False False \n",
"5 6 0.853172 10419.932278 True False False False False \n",
"6 7 0.853860 10418.104025 True False False False True \n",
"7 8 0.854297 10417.290103 True True False False True \n",
"8 9 0.854172 10418.789079 True True False True True \n",
"9 10 0.854036 10420.330800 True True False True True \n",
"10 11 0.853796 10422.298278 True True True True True \n",
"\n",
" Fuel_Type_Petrol HP KM Met_Color Quarterly_Tax Weight \n",
"0 False False False False False False \n",
"1 False True False False False False \n",
"2 False True False False False True \n",
"3 False True True False False True \n",
"4 False True True False True True \n",
"5 True True True False True True \n",
"6 True True True False True True \n",
"7 True True True False True True \n",
"8 True True True False True True \n",
"9 True True True True True True \n",
"10 True True True True True True \n"
]
}
],
"source": [
"def train_model(variables):\n",
" model = LinearRegression()\n",
" model.fit(train_X[variables], train_y)\n",
" return model\n",
"\n",
"def score_model(model, variables):\n",
" pred_y = model.predict(train_X[variables])\n",
" # we negate as score is optimized to be as low as possible\n",
" return -adjusted_r2_score(train_y, pred_y, model)\n",
"\n",
"allVariables = train_X.columns\n",
"results = exhaustive_search(allVariables, train_model, score_model)\n",
"\n",
"data = []\n",
"for result in results:\n",
" model = result['model']\n",
" variables = result['variables']\n",
" AIC = AIC_score(train_y, model.predict(train_X[variables]), model)\n",
" \n",
" d = {'n': result['n'], 'r2adj': -result['score'], 'AIC': AIC}\n",
" d.update({var: var in result['variables'] for var in allVariables})\n",
" data.append(d)\n",
"pd.set_option('display.width', 100)\n",
"print(pd.DataFrame(data, columns=('n', 'r2adj', 'AIC') + tuple(sorted(allVariables))))\n",
"pd.reset_option('display.width')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Table 6.6 backward elimination"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Variables: Age_08_04, KM, HP, Met_Color, Automatic, CC, Doors, Quarterly_Tax, Weight, Fuel_Type_Diesel, Fuel_Type_Petrol\n",
"Start: score=10422.30\n",
"Step: score=10420.33, remove CC\n",
"Step: score=10418.79, remove Met_Color\n",
"Step: score=10417.29, remove Doors\n",
"Step: score=10417.29, remove None\n",
"['Age_08_04', 'KM', 'HP', 'Automatic', 'Quarterly_Tax', 'Weight', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol']\n"
]
}
],
"source": [
"def train_model(variables):\n",
" model = LinearRegression()\n",
" model.fit(train_X[variables], train_y)\n",
" return model\n",
"\n",
"def score_model(model, variables):\n",
" return AIC_score(train_y, model.predict(train_X[variables]), model)\n",
"\n",
"best_model, best_variables = backward_elimination(train_X.columns, train_model, score_model, verbose=True)\n",
"\n",
"print(best_variables)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 103.3045\n",
" Root Mean Squared Error (RMSE) : 1314.4844\n",
" Mean Absolute Error (MAE) : 1016.8875\n",
" Mean Percentage Error (MPE) : -0.2700\n",
"Mean Absolute Percentage Error (MAPE) : 8.9984\n"
]
}
],
"source": [
"regressionSummary(valid_y, best_model.predict(valid_X[best_variables]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Table 6.7 Forward selection"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Variables: Age_08_04, KM, HP, Met_Color, Automatic, CC, Doors, Quarterly_Tax, Weight, Fuel_Type_Diesel, Fuel_Type_Petrol\n",
"Start: score=11565.07, constant\n",
"Step: score=10689.71, add Age_08_04\n",
"Step: score=10597.91, add HP\n",
"Step: score=10506.08, add Weight\n",
"Step: score=10445.17, add KM\n",
"Step: score=10435.58, add Quarterly_Tax\n",
"Step: score=10419.93, add Fuel_Type_Petrol\n",
"Step: score=10418.10, add Fuel_Type_Diesel\n",
"Step: score=10417.29, add Automatic\n",
"Step: score=10417.29, add None\n",
"['Age_08_04', 'HP', 'Weight', 'KM', 'Quarterly_Tax', 'Fuel_Type_Petrol', 'Fuel_Type_Diesel', 'Automatic']\n"
]
}
],
"source": [
"# The initial model is the constant model - this requires special handling\n",
"# in train_model and score_model\n",
"def train_model(variables):\n",
" if len(variables) == 0:\n",
" return None\n",
" model = LinearRegression()\n",
" model.fit(train_X[variables], train_y)\n",
" return model\n",
"\n",
"def score_model(model, variables):\n",
" if len(variables) == 0:\n",
" return AIC_score(train_y, [train_y.mean()] * len(train_y), model, df=1)\n",
" return AIC_score(train_y, model.predict(train_X[variables]), model)\n",
"\n",
"best_model, best_variables = forward_selection(train_X.columns, train_model, score_model, verbose=True)\n",
"\n",
"print(best_variables)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Variables: Age_08_04, KM, HP, Met_Color, Automatic, CC, Doors, Quarterly_Tax, Weight, Fuel_Type_Diesel, Fuel_Type_Petrol\n",
"Start: score=11565.07, constant\n",
"Step: score=10689.71, add Age_08_04\n",
"Step: score=10597.91, add HP\n",
"Step: score=10506.08, add Weight\n",
"Step: score=10445.17, add KM\n",
"Step: score=10435.58, add Quarterly_Tax\n",
"Step: score=10419.93, add Fuel_Type_Petrol\n",
"Step: score=10418.10, add Fuel_Type_Diesel\n",
"Step: score=10417.29, add Automatic\n",
"Step: score=10417.29, unchanged None\n",
"['Age_08_04', 'HP', 'Weight', 'KM', 'Quarterly_Tax', 'Fuel_Type_Petrol', 'Fuel_Type_Diesel', 'Automatic']\n"
]
}
],
"source": [
"best_model, best_variables = stepwise_selection(train_X.columns, train_model, score_model, verbose=True)\n",
"\n",
"print(best_variables)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Regularized methods (lasso, ridge, Bayesian ridge)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 120.6311\n",
" Root Mean Squared Error (RMSE) : 1332.2752\n",
" Mean Absolute Error (MAE) : 1021.5286\n",
" Mean Percentage Error (MPE) : -0.2364\n",
"Mean Absolute Percentage Error (MAPE) : 9.0115\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 145.1571\n",
" Root Mean Squared Error (RMSE) : 1397.9428\n",
" Mean Absolute Error (MAE) : 1052.4649\n",
" Mean Percentage Error (MPE) : -0.2966\n",
"Mean Absolute Percentage Error (MAPE) : 9.2918\n",
"Lasso-CV chosen regularization: 3.5138446691310588\n",
"[-1.40370575e+02 -1.76669006e-02 3.38674037e+01 0.00000000e+00\n",
" 6.94393427e+01 0.00000000e+00 0.00000000e+00 2.70913468e+00\n",
" 1.24342596e+01 -0.00000000e+00 0.00000000e+00]\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 154.3286\n",
" Root Mean Squared Error (RMSE) : 1879.7426\n",
" Mean Absolute Error (MAE) : 1353.2735\n",
" Mean Percentage Error (MPE) : -2.3897\n",
"Mean Absolute Percentage Error (MAPE) : 11.1309\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 105.5382\n",
" Root Mean Squared Error (RMSE) : 1313.0217\n",
" Mean Absolute Error (MAE) : 1017.2356\n",
" Mean Percentage Error (MPE) : -0.2703\n",
"Mean Absolute Percentage Error (MAPE) : 9.0012\n",
"Bayesian ridge chosen regularization: 0.004622833440097622\n"
]
}
],
"source": [
"# NOTE(review): the `normalize` estimator parameter was deprecated in\n",
"# scikit-learn 1.0 and removed in 1.2; on a modern install these calls raise\n",
"# TypeError and need a StandardScaler pipeline instead -- TODO confirm the\n",
"# scikit-learn version pinned for this notebook.\n",
"lasso = Lasso(normalize=True, alpha=1)\n",
"lasso.fit(train_X, train_y)\n",
"regressionSummary(valid_y, lasso.predict(valid_X))\n",
"\n",
"lasso_cv = LassoCV(normalize=True, cv=5)\n",
"lasso_cv.fit(train_X, train_y)\n",
"regressionSummary(valid_y, lasso_cv.predict(valid_X))\n",
"print('Lasso-CV chosen regularization: ', lasso_cv.alpha_)\n",
"print(lasso_cv.coef_)\n",
"\n",
"ridge = Ridge(normalize=True, alpha=1)\n",
"ridge.fit(train_X, train_y)\n",
"regressionSummary(valid_y, ridge.predict(valid_X))\n",
"\n",
"bayesianRidge = BayesianRidge(normalize=True)\n",
"bayesianRidge.fit(train_X, train_y)\n",
"regressionSummary(valid_y, bayesianRidge.predict(valid_X))\n",
"print('Bayesian ridge chosen regularization: ', bayesianRidge.lambda_ / bayesianRidge.alpha_)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 103.6803\n",
" Root Mean Squared Error (RMSE) : 1312.8523\n",
" Mean Absolute Error (MAE) : 1017.5972\n",
" Mean Percentage Error (MPE) : -0.2633\n",
"Mean Absolute Percentage Error (MAPE) : 9.0111\n"
]
}
],
"source": [
"linearRegression = LinearRegression(normalize=True).fit(train_X, train_y)\n",
"regressionSummary(valid_y, linearRegression.predict(valid_X))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>features</th>\n",
" <th>linear regression</th>\n",
" <th>lassoCV</th>\n",
" <th>bayesianRidge</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Age_08_04</td>\n",
" <td>-140.748761</td>\n",
" <td>-140.370575</td>\n",
" <td>-139.754059</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>KM</td>\n",
" <td>-0.017840</td>\n",
" <td>-0.017667</td>\n",
" <td>-0.018131</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>HP</td>\n",
" <td>36.103419</td>\n",
" <td>33.867404</td>\n",
" <td>35.856074</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Met_Color</td>\n",
" <td>84.281830</td>\n",
" <td>0.000000</td>\n",
" <td>85.088966</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Automatic</td>\n",
" <td>416.781954</td>\n",
" <td>69.439343</td>\n",
" <td>408.599781</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>CC</td>\n",
" <td>0.017737</td>\n",
" <td>0.000000</td>\n",
" <td>0.020405</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Doors</td>\n",
" <td>-50.657863</td>\n",
" <td>0.000000</td>\n",
" <td>-47.917629</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Quarterly_Tax</td>\n",
" <td>13.625325</td>\n",
" <td>2.709135</td>\n",
" <td>13.269979</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Weight</td>\n",
" <td>13.038711</td>\n",
" <td>12.434260</td>\n",
" <td>13.114412</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Fuel_Type_Diesel</td>\n",
" <td>1066.464681</td>\n",
" <td>-0.000000</td>\n",
" <td>955.581484</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Fuel_Type_Petrol</td>\n",
" <td>2310.249543</td>\n",
" <td>0.000000</td>\n",
" <td>2162.115762</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" features linear regression lassoCV bayesianRidge\n",
"0 Age_08_04 -140.748761 -140.370575 -139.754059\n",
"1 KM -0.017840 -0.017667 -0.018131\n",
"2 HP 36.103419 33.867404 35.856074\n",
"3 Met_Color 84.281830 0.000000 85.088966\n",
"4 Automatic 416.781954 69.439343 408.599781\n",
"5 CC 0.017737 0.000000 0.020405\n",
"6 Doors -50.657863 0.000000 -47.917629\n",
"7 Quarterly_Tax 13.625325 2.709135 13.269979\n",
"8 Weight 13.038711 12.434260 13.114412\n",
"9 Fuel_Type_Diesel 1066.464681 -0.000000 955.581484\n",
"10 Fuel_Type_Petrol 2310.249543 0.000000 2162.115762"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame({'features': train_X.columns, 'linear regression': linearRegression.coef_, \n",
" 'lassoCV': lasso_cv.coef_, 'bayesianRidge': bayesianRidge.coef_})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Table 6.10"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" OLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: Price R-squared: 0.856\n",
"Model: OLS Adj. R-squared: 0.854\n",
"Method: Least Squares F-statistic: 319.0\n",
"Date: Sun, 07 Apr 2019 Prob (F-statistic): 1.73e-239\n",
"Time: 13:53:06 Log-Likelihood: -5198.1\n",
"No. Observations: 600 AIC: 1.042e+04\n",
"Df Residuals: 588 BIC: 1.047e+04\n",
"Df Model: 11 \n",
"Covariance Type: nonrobust \n",
"====================================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------------\n",
"Intercept -1319.3544 1728.427 -0.763 0.446 -4713.997 2075.288\n",
"Age_08_04 -140.7488 5.142 -27.374 0.000 -150.847 -130.650\n",
"KM -0.0178 0.002 -7.286 0.000 -0.023 -0.013\n",
"HP 36.1034 5.321 6.785 0.000 25.653 46.554\n",
"Met_Color 84.2818 127.005 0.664 0.507 -165.158 333.721\n",
"Automatic 416.7820 259.794 1.604 0.109 -93.454 927.018\n",
"CC 0.0177 0.099 0.179 0.858 -0.177 0.213\n",
"Doors -50.6579 65.187 -0.777 0.437 -178.686 77.371\n",
"Quarterly_Tax 13.6253 2.518 5.411 0.000 8.680 18.571\n",
"Weight 13.0387 1.602 8.140 0.000 9.893 16.185\n",
"Fuel_Type_Diesel 1066.4647 527.285 2.023 0.044 30.872 2102.057\n",
"Fuel_Type_Petrol 2310.2495 521.045 4.434 0.000 1286.914 3333.585\n",
"==============================================================================\n",
"Omnibus: 62.422 Durbin-Watson: 1.899\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 366.046\n",
"Skew: 0.186 Prob(JB): 3.27e-80\n",
"Kurtosis: 6.808 Cond. No. 2.20e+06\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
"[2] The condition number is large, 2.2e+06. This might indicate that there are\n",
"strong multicollinearity or other numerical problems.\n"
]
}
],
"source": [
"# run a linear regression of Price on the remaining 11 predictors in the training set\n",
"# NOTE(review): this rebinds `car_lm` (a fitted sklearn LinearRegression in the\n",
"# cells above) to a statsmodels results object; re-running any earlier cell that\n",
"# uses `car_lm` after this one will operate on the wrong object.\n",
"train_df = train_X.join(train_y)\n",
"\n",
"predictors = train_X.columns\n",
"formula = 'Price ~ ' + ' + '.join(predictors)\n",
"\n",
"car_lm = sm.ols(formula=formula, data=train_df).fit()\n",
"print(car_lm.summary())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}