777 lines
36 KiB
Plaintext
777 lines
36 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Chapter 6: Multiple Linear Regression\n",
|
|
"\n",
|
|
"> (c) 2019 Galit Shmueli, Peter C. Bruce, Peter Gedeck \n",
|
|
">\n",
|
|
"> Code included in\n",
|
|
">\n",
|
|
"> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition) \n",
|
|
"> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel. 2019.\n",
|
|
"\n",
|
|
"## Import required packages"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"%matplotlib inline\n",
|
|
"from pathlib import Path\n",
|
|
"\n",
|
|
"import pandas as pd\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge\n",
|
|
"import statsmodels.formula.api as sm\n",
|
|
"import matplotlib.pylab as plt\n",
|
|
"\n",
|
|
"#from dmba import regressionSummary, exhaustive_search\n",
|
|
"#from dmba import backward_elimination, forward_selection, stepwise_selection\n",
|
|
"#from dmba import adjusted_r2_score, AIC_score, BIC_score\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table 6.3"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {
|
|
"scrolled": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"ename": "NameError",
|
|
"evalue": "name 'pd' is not defined",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m# Reduce data frame to the top 1000 rows and select columns for regression analysis\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m car_df \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39mread_csv(\u001b[39m'\u001b[39m\u001b[39mToyotaCorolla.csv\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m 3\u001b[0m car_df \u001b[39m=\u001b[39m car_df\u001b[39m.\u001b[39miloc[\u001b[39m0\u001b[39m:\u001b[39m1000\u001b[39m]\n\u001b[1;32m 5\u001b[0m predictors \u001b[39m=\u001b[39m [\u001b[39m'\u001b[39m\u001b[39mAge_08_04\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mKM\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mFuel_Type\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mHP\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mMet_Color\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mAutomatic\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mCC\u001b[39m\u001b[39m'\u001b[39m, \n\u001b[1;32m 6\u001b[0m \u001b[39m'\u001b[39m\u001b[39mDoors\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mQuarterly_Tax\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mWeight\u001b[39m\u001b[39m'\u001b[39m]\n",
|
|
"\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Reduce data frame to the top 1000 rows and select columns for regression analysis\n",
|
|
"car_df = pd.read_csv('ToyotaCorolla.csv')\n",
|
|
"car_df = car_df.iloc[0:1000]\n",
|
|
"\n",
|
|
"predictors = ['Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Met_Color', 'Automatic', 'CC', \n",
|
|
" 'Doors', 'Quarterly_Tax', 'Weight']\n",
|
|
"outcome = 'Price'\n",
|
|
"\n",
|
|
"# partition data\n",
|
|
"X = pd.get_dummies(car_df[predictors], drop_first=True)\n",
|
|
"y = car_df[outcome]\n",
|
|
"train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)\n",
|
|
"\n",
|
|
"car_lm = LinearRegression()\n",
|
|
"car_lm.fit(train_X, train_y)\n",
|
|
"\n",
|
|
"# print coefficients\n",
|
|
"print('intercept ', car_lm.intercept_)\n",
|
|
"print(pd.DataFrame({'Predictor': X.columns, 'coefficient': car_lm.coef_}))\n",
|
|
"\n",
|
|
"# print performance measures\n",
|
|
"regressionSummary(train_y, car_lm.predict(train_X))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"adjusted r2 : 0.8537958550253093\n",
|
|
"AIC : 10422.298278332171\n",
|
|
"BIC : 10479.45836384998\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"pred_y = car_lm.predict(train_X)\n",
|
|
"\n",
|
|
"print('adjusted r2 : ', adjusted_r2_score(train_y, pred_y, car_lm))\n",
|
|
"print('AIC : ', AIC_score(train_y, pred_y, car_lm))\n",
|
|
"print('BIC : ', BIC_score(train_y, pred_y, car_lm))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table 6.4"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" Predicted Actual Residual\n",
|
|
"507 10607.333940 11500 892.666060\n",
|
|
"818 9272.705792 8950 -322.705792\n",
|
|
"452 10617.947808 11450 832.052192\n",
|
|
"368 13600.396275 11450 -2150.396275\n",
|
|
"242 12396.694660 11950 -446.694660\n",
|
|
"929 9496.498212 9995 498.501788\n",
|
|
"262 12480.063217 13500 1019.936783\n",
|
|
"810 8834.146068 7950 -884.146068\n",
|
|
"318 12183.361282 9900 -2283.361282\n",
|
|
"49 19206.965683 21950 2743.034317\n",
|
|
"446 10987.498309 11950 962.501691\n",
|
|
"142 18501.527375 19950 1448.472625\n",
|
|
"968 9914.690947 9950 35.309053\n",
|
|
"345 13827.299932 14950 1122.700068\n",
|
|
"971 7966.732543 10495 2528.267457\n",
|
|
"133 17185.242041 15950 -1235.242041\n",
|
|
"104 19952.658062 19450 -502.658062\n",
|
|
"6 16570.609280 16900 329.390720\n",
|
|
"600 13739.409113 11250 -2489.409113\n",
|
|
"496 11267.513740 11750 482.486260\n",
|
|
"\n",
|
|
"Regression statistics\n",
|
|
"\n",
|
|
" Mean Error (ME) : 103.6803\n",
|
|
" Root Mean Squared Error (RMSE) : 1312.8523\n",
|
|
" Mean Absolute Error (MAE) : 1017.5972\n",
|
|
" Mean Percentage Error (MPE) : -0.2633\n",
|
|
"Mean Absolute Percentage Error (MAPE) : 9.0111\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Use predict() to make predictions on a new set\n",
|
|
"car_lm_pred = car_lm.predict(valid_X)\n",
|
|
"\n",
|
|
"result = pd.DataFrame({'Predicted': car_lm_pred, 'Actual': valid_y,\n",
|
|
" 'Residual': valid_y - car_lm_pred})\n",
|
|
"print(result.head(20))\n",
|
|
"\n",
|
|
"# Compute common accuracy measures\n",
|
|
"regressionSummary(valid_y, car_lm_pred)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Figure 6.1\n",
|
|
"Determine the residuals and create a histogram"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"0.7425\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAagAAAEYCAYAAAAJeGK1AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFFRJREFUeJzt3X+QXWV9x/H3l0QwEiTB4BoTNFCoFUz9wYp06LQbUImAwh/qYBknVDSdqh1a09Eg0zpOtQMihbHqYKqW0NEGRBEGxx9IWX/MCEhUjIhIhAgBTEQSZJVao9/+cZ/ITbi7e/fuvdln732/Zu7sOc/5cZ/77D372XPOc58bmYkkSbXZb6YrIElSKwaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlNQDEXFHRIyMs2wkIrZ26XlGI+LN3diXVJu5M10BaaZFxBZgCPgdMAZ8CXh7Zo51us/MPKY7tZMGl2dQUsOrM3M+8CLgxcB5M1wfaeAZUFKTzPwZ8GUaQUVEHBARH4yI+yJiW0RcFhHzyrJFEXF9ROyMiEci4hsRsV9ZtiUiXl6m50XE5RGxIyJ+CLy0+TkjIiPiyKb5yyPifWV6YXmOn5ftr4+Ipa3qHhFHRsTXIuLRiHg4Iq7sQRNJ+4wBJTUpf/xfBWwuRRcCf0wjsI4ElgD/XJatAbYCh9K4RPhuoNXYYe8B/qg8TgZWTaFK+wH/CTwXeA7wOPDhcdb9F+ArwEJgKfDvU3geqToGlNTw+Yh4DLgf2A68JyICeAvwD5n5SGY+BvwrcGbZ5rfAYuC5mfnbzPxGth7c8vXA+8s+7gc+1G6lMvMXmfnZzPx1ef73A385zuq/pRFkz87M/83Mb7b7PFKNDCip4YzMPAgYAf4EWETjzOhpwMZyGW8njQ4Uh5ZtLqJxpvWViLgnItaOs+9n0wi+3X7abqUi4mkR8bGI+GlE/BL4OrAgIua0WP2dQAC3ll6Eb2r3eaQaGVBSk8z8GnA58EHgYRqX1I7JzAXlcXDpTEFmPpaZazLzCODVwDsi4qQWu30IOKxp/jl7Lf81jSDc7VlN02uA5wEvy8ynA39RyqNF3X+WmW/JzGcDfwN8tPneljTbGFDSk10KvAL4U+A/gEsi4pkAEbEkIk4u06eVjgkB/JJGN/XftdjfVcB5pcPDUuDv9lr+PeCvImJORKxkz0t4B9EIyZ0RcQiN+1ktRcTrmjpQ7KBxP6xVfaRZwYCS9pKZPweuAP4JeBeNy3g3l0tsX6VxRgNwVJkfA74FfDQzR1vs8r00LuvdS6MTw3/ttfxcGmdgO4GzgM83LbsUmEfjbO5mGpcYx/NS4JaIGAOuA87NzHsnf8VSncIvLJQk1cgzKElSlQwoSVKVDChJUpXaGiy2DKb5GI0eQbsyc7j0KLoSWAZsAV6fmTt6U01J0qBpq5NECajhzHy4qewDwCOZeUH5gOLCzHzXRPtZtGhRLlu2bHo1nsCvfvUrDjzwwJ7tvx/ZZp2x3Tpju01dP7bZxo0bH87MQydbbzpft3E6jU/dA6wHRml0yR3XsmXLuO2226bxlBMbHR1lZGRk0vX0BNusM7ZbZ2y3qevHNouItkZTafcM6l6e+ODfxzJzXUTszMwFTevsyMyFLbZdDawGGBoaOnbDhg1tvoSpGxsbY/78+T3bfz+yzTpju3XGdpu6fmyzFStWbMzM4cnWa/cM6oTMfLB8mv6GiPhRuxXJzHXAOoDh4eHs5X8C/fifRq/ZZp2x3Tpju03dILdZW734MvPB8nM7cA1wHLAtIhYDlJ/be1VJSdLgmTSgIuLAiDho9zTwSuAHNIZS2f29NquAa3tVSUnS4GnnEt8QcE1jPEzmAp/OzC9FxLeBqyLiHOA+4HW9q6YkadBMGlCZeQ/wwhblvwBafbWAJEnT5kgSkqQqGVCSpCoZUJKkKhlQkqQqTWeoI2lWWLb2Cx1tt+WCU7tcE0lT4RmUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUp+UFcax0Qf8F2zfBdnd/gB4In44WDpCZ5BSZKqZEBJkqpkQEmSqmRASZKqZEBJkqpkQEmSqmRASZKqZEBJkqpkQEmSqmRASZKqZEBJkqpkQEmSqmRASZKq5GjmUkUmGkF9Io6Crn7kGZQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKbQdURMyJiO9GxPVl/vCIuCUi7o6IKyNi/95VU5I0aKZyBnUucGfT/IXAJZl5FLADOKebFZMkDba2AioilgKnAh8v8wGcCFxdVlkPnNGLCkqSBlO7Z1CXAu8Efl/mnwHszMxdZX4rsKTLdZMkDbBJRzOPiNOA7Zm5MSJGdhe3WDXH2X41sBpgaGiI0dHRzmrahrGxsZ7uvx8NQputWb5r8pWmaGheb/bbqdnyOxyE91u3DXKbtfN1GycAr4mIU4CnAk+ncUa1ICLmlrOopcCDrTbOzHXAOoDh4eEcGRnpRr1bGh0dpZf770eD0GZnd/gVFhNZs3wXF2+q59tqtpw1MtNVaMsgvN+6bZDbbNJLfJl5XmYuzcxlwJnA/2TmWcBNwGvLaquAa3tWS0nSwJnO56DeBbwjIjbTuCf1ie5USZKkKX6jbmaOAqNl+h7guO5XSWqt02+blTQ7OZKEJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSnNnugKSZs6ytV/oaLstF5za5ZpIT+YZlCSpSpMGVEQ8NSJujYjbI+KOiHhvKT88Im6JiLsj4sqI2L/31ZUkDYp2zqB+A5yYmS8EXgSsjIjjgQuBSzLzKGAHcE7vqilJGjSTBlQ2jJXZp5RHAicCV5fy9cAZPamhJGkgRWZOvlLEHGAjcCTwEeAi4ObMPLIsPwz4Yma+oMW2q4HVAENDQ8du2LChe7Xfy9jYGPPnz+/Z/vvRbGqzTQ88OtNV+IOhebDt8ZmuxROWLzm4o+06bdNOn282vd9q0Y9ttmLFio2ZOTzZem314svM3wEviogFwDXA81utNs6264B1AMPDwzkyMtLOU3ZkdHSUXu6/H82mNju7wx5nvbBm+S4u3lRPJ9gtZ410tF2nbdrp882m91stBrnNptSLLzN3AqPA8cCCiNh9hC4FHuxu1SRJg6ydXnyHljMnImIe8HLgTuAm4LVltVXAtb2qpCRp8LRzjWIxsL7ch9oPuCozr4+IHwIbIuJ9wHeBT/SwnpKkATNpQGXm94EXtyi/BziuF5WSJMmRJCRJVTKgJElVMqAkSVWq54MckjrW6ajkUs08g5IkVcmAkiRVyYCSJFXJgJIkVcmAkiRVyYCSJFXJgJIkVcmAkiRVyQ/qap/zQ6WS2uEZlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUoGlCSpSgaUJKlKBpQkqUqTBlREHBYRN0XEnRFxR0ScW8oPiYgbIuLu8nNh76srSRoU7ZxB7QLWZObzgeOBt0XE0cBa4MbMPAq4scxLktQVkwZUZj6Umd8p048BdwJLgNOB9WW19cAZvaqkJGnwRGa2v3LEMuDrwAuA+zJzQdOyHZn5pMt8EbEaWA0wNDR07IYNG6ZZ5fGNjY0xf/78nu2/H81Em2164NF9+ny9MDQPtj0+07WYOcuXHNzRdh6jU9ePbbZixYqNmTk82XptB1REzAe+Brw/Mz8XETvbCahmw8PDedttt7X1fJ0YHR1lZGSkZ/vvRzPRZsvWfmGfPl8vrFm+i4s3zZ3pasyYLRec2tF2HqNT149tFhFtBVRbvfgi4inAZ4FPZebnSvG2iFhcli8GtndaWUmS9tZOL74APgHcmZn/1rToOmBVmV4FXNv96kmSBlU71yhOAN4IbIqI75WydwMXAFdFxDnAfcDrelNFSdIgmjSgMvObQIyz+KTuVkeSpAZHkpAkVcmAkiRVyYCSJFXJgJIkVcmAkiRVyYCSJFXJgJIkVcmAkiRVaXBHu9S09cOgr5Lq5RmUJKlKBpQkqUoGlCSpSgaUJKlKdpKQNGWddpC5fOWBXa6J+plnUJKkKhlQkqQqGVCSpCoZUJKkKhlQkqQqGVCSpCoZUJKkKhlQkqQqGVCSpCoZUJKkKhlQkqQqGVCSpCoZUJKkKhlQkqQqGVCSpCoZUJKkKhlQkqQq+Y266vjbUSWplzyDkiRVyYCSJFXJgJIkVcmAkiRVyYCSJFVp0oCKiE9GxPaI+EFT2SERcUNE3F1+LuxtNSVJg6adM6jLgZV7la0FbszMo4Aby7wkSV0zaUBl5teBR/YqPh1YX6bXA2d0uV6SpAEXmTn5ShHLgOsz8wVlfmdmLmhaviMzW17mi4jVwGqAoaGhYzds2NCFarc2NjbG/Pnze7b/2m164NEpbzM0D7Y93oPK9DnbrTOHHzxnoI/RTvTj37UVK1ZszMzhydbr+UgSmbkOWAcwPDycIyMjPXuu0dFRern/2p3dwYgQa5bv4uJNDigyVbZbZy5feeBAH6OdGOS/a5324tsWEYsBys/t3auSJEmdB9R1wKoyvQq4tjvVkSSpoZ1u5v8NfAt4XkRsjYhzgAuAV0TE3cAryrwkSV0z6UX0zHzDOItO6nJdJPW5TQ882tG90i0XnNqD2qh2jiQhSaqSASVJqpIBJUmqkgElSaqSASVJqpIBJUmqkgElSaqSASVJqpIBJUmqkgElSaqSASVJqpIBJUmqkgElSaqSXwlaoWUdjPYsSf3GMyhJUpUMKElSlQwoSVKVDChJUpXsJCGpb3Xa4civmK+DZ1CSpCoZUJKkKhlQkqQqeQ+qh/zArdQdHkuDyTMoSVKVDChJUpUMKElSlQwoSVKVDChJUpUMKElSlQwoSVKVDChJUpUMKElSlWbdSBITfaJ8zfJdnD3BckcoltSOfh8Ffba8Ps+gJElVMqAkSVUyoCRJVZp196CmwxGRJfVSL/7GTHRvfbbc8+qUZ1CSpCpNK6AiYmVE3BURmyNibbcqJUlSxwEVEXOAjwCvAo4G3hARR3erYpKkwTadM6jjgM2ZeU9m/h+wATi9O9WSJA26yMzONox4LbAyM99c5t8IvCwz377XequB1WX2ecBdnVd3UouAh3u4/35km3XGduuM7TZ1/dhmz83MQydbaTq9+KJF2ZPSLjPXAeum8Txti4jbMnN4XzxXv7DNOmO7dcZ2m7pBbrPpXOLbChzWNL8UeHB61ZEkqWE6AfVt4KiIODwi9gfOBK7rTrUkSYOu40t8mbkrIt4OfBmYA3wyM+/oWs06s08uJfYZ26wztltnbLepG9g267iThCRJveRIEpKkKhlQkqQqzbqAioh/jIiMiEVlPiLiQ2W4pe9HxEua1l0VEXeXx6qm8mMjYlPZ5kMR0arLfF+IiIsi4kelba6JiAVNy84rbXBXRJzcVN5yCKvSIeaW0p5Xls4xA8XhvfYUEYdFxE0RcWdE3BER55byQyLihvJeuSEiFpbyKR+v/Soi5kTEdyPi+jLf8viKiAPK/OayfFnTPloew30jM2fNg0a39i8DPwUWlbJTgC/S+FzW8cAtpfwQ4J7yc2GZXliW3Qr8Wdnmi8CrZvq19bDNXgnMLdMXAheW6aOB24EDgMOBn9Do7DKnTB8B7F/WObpscxVwZpm+DPjbmX59+7gtx22bQX0Ai4GXlOmDgB+X99YHgLWlfG3T+27Kx2u/PoB3AJ8Gri/zLY8v4K3AZWX6TODKMt3yGJ7p19XNx2w7g7oEeCd7fiD4dOCKbLgZWBARi4GTgRsy85HM3AHcAKwsy56emd/Kxm/5CuCMffsy9p3M/Epm7iqzN9P4vBo02m1DZv4mM+8FNtMYvqrlEFblLPNE4Oqy/Xr6uN3G4fBee8nMhzLzO2X6MeBOYAmNdllfVmt+r0zpeN2HL2WfioilwKnAx8v8RMdXc1teDZxU1h/vGO4bsyagIuI1wAOZeftei5YA9zfNby1lE5VvbVE+CN5E479XmHq7PQPY2RR2g9Ruu43XNgLKpacXA7cAQ5n5EDRCDHhmWW2q77t+dSmNf7Z/X+YnOr7+0DZl+aNl/b5vs6q+sDAivgo8q8Wi84F307hc9aTNWpRlB+Wz1kTtlpnXlnXOB3YBn9q9WYv1k9b/tPRlu3XANhhHRMwHPgv8fWb+coLbugNzXI4nIk4DtmfmxogY2V3cYtWcZFnft1lVAZWZL29VHhHLaVxjvb288ZcC34mI4xh/yKWtwMhe5aOlfGmL9Wet8dptt3LD+TTgpHJZEyYeqqpV+cM0LsfMLf/Fzfp264DDe7UQEU+hEU6fyszPleJtEbE4Mx8ql/C2l/KpHq/96ATgNRFxCvBU4Ok0zqjGO752t9nWiJgLHAw8wiC8H2f6JlgnD2ALT3SSOJU9b7reWsoPAe6lccN1YZk+pCz7dll3dyeJU2b6NfWwrVYCPwQO3av8GPa8wXoPjU4Ac8v04TzREeCYss1n2PMm7ltn+vXt47Yct20G9VGOoSuAS/cqv4g9O0l8oExP+Xjt5weNUN7dSaLl8QW8jT07SVxVplsewzP9mrraPjNdgQ5/qc0BFTS+OPEnwCZguGm9N9G4cbgZ+Oum8mHgB2WbD1NG1OjHR3nt9wPfK4/LmpadX9rgLpp6MtLoafXjsuz8pvIjaPSA3FwOpgNm+vXNQHu2bJtBfQB/TuOy0veb3mOn0LhHciNwd/m5+5/DKR+v/fzYK6BaHl80zrI+U8pvBY5o2r7lMdwvD4c6kiRVadb04pMkDRYDSpJUJQNKklQlA0qSVCUDSpJUJQNKklQlA0qSVKX/Bwv3rkJVgddRAAAAAElFTkSuQmCC",
|
|
"text/plain": [
|
|
"<Figure size 432x288 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"needs_background": "light"
|
|
},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"car_lm_pred = car_lm.predict(valid_X)\n",
|
|
"all_residuals = valid_y - car_lm_pred\n",
|
|
"\n",
|
|
"# Determine the percentage of datapoints with a residual in [-1406, 1406] = approx. 75\\%\n",
|
|
"print(len(all_residuals[(all_residuals > -1406) & (all_residuals < 1406)]) / len(all_residuals))\n",
|
|
"\n",
|
|
"ax = pd.DataFrame({'Residuals': all_residuals}).hist(bins=25)\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table 6.5\n",
|
|
"Run an exhaustive search. The Fuel type column is categorical and needs to be converted into dummy variables."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" n r2adj AIC Age_08_04 Automatic CC Doors Fuel_Type_Diesel \\\n",
|
|
"0 1 0.767901 10689.712094 True False False False False \n",
|
|
"1 2 0.801160 10597.910645 True False False False False \n",
|
|
"2 3 0.829659 10506.084235 True False False False False \n",
|
|
"3 4 0.846357 10445.174820 True False False False False \n",
|
|
"4 5 0.849044 10435.578836 True False False False False \n",
|
|
"5 6 0.853172 10419.932278 True False False False False \n",
|
|
"6 7 0.853860 10418.104025 True False False False True \n",
|
|
"7 8 0.854297 10417.290103 True True False False True \n",
|
|
"8 9 0.854172 10418.789079 True True False True True \n",
|
|
"9 10 0.854036 10420.330800 True True False True True \n",
|
|
"10 11 0.853796 10422.298278 True True True True True \n",
|
|
"\n",
|
|
" Fuel_Type_Petrol HP KM Met_Color Quarterly_Tax Weight \n",
|
|
"0 False False False False False False \n",
|
|
"1 False True False False False False \n",
|
|
"2 False True False False False True \n",
|
|
"3 False True True False False True \n",
|
|
"4 False True True False True True \n",
|
|
"5 True True True False True True \n",
|
|
"6 True True True False True True \n",
|
|
"7 True True True False True True \n",
|
|
"8 True True True False True True \n",
|
|
"9 True True True True True True \n",
|
|
"10 True True True True True True \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def train_model(variables):\n",
|
|
" model = LinearRegression()\n",
|
|
" model.fit(train_X[variables], train_y)\n",
|
|
" return model\n",
|
|
"\n",
|
|
"def score_model(model, variables):\n",
|
|
" pred_y = model.predict(train_X[variables])\n",
|
|
" # we negate as score is optimized to be as low as possible\n",
|
|
" return -adjusted_r2_score(train_y, pred_y, model)\n",
|
|
"\n",
|
|
"allVariables = train_X.columns\n",
|
|
"results = exhaustive_search(allVariables, train_model, score_model)\n",
|
|
"\n",
|
|
"data = []\n",
|
|
"for result in results:\n",
|
|
" model = result['model']\n",
|
|
" variables = result['variables']\n",
|
|
" AIC = AIC_score(train_y, model.predict(train_X[variables]), model)\n",
|
|
" \n",
|
|
" d = {'n': result['n'], 'r2adj': -result['score'], 'AIC': AIC}\n",
|
|
" d.update({var: var in result['variables'] for var in allVariables})\n",
|
|
" data.append(d)\n",
|
|
"pd.set_option('display.width', 100)\n",
|
|
"print(pd.DataFrame(data, columns=('n', 'r2adj', 'AIC') + tuple(sorted(allVariables))))\n",
|
|
"pd.reset_option('display.width')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table 6.6 backward elimination"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Variables: Age_08_04, KM, HP, Met_Color, Automatic, CC, Doors, Quarterly_Tax, Weight, Fuel_Type_Diesel, Fuel_Type_Petrol\n",
|
|
"Start: score=10422.30\n",
|
|
"Step: score=10420.33, remove CC\n",
|
|
"Step: score=10418.79, remove Met_Color\n",
|
|
"Step: score=10417.29, remove Doors\n",
|
|
"Step: score=10417.29, remove None\n",
|
|
"['Age_08_04', 'KM', 'HP', 'Automatic', 'Quarterly_Tax', 'Weight', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def train_model(variables):\n",
|
|
" model = LinearRegression()\n",
|
|
" model.fit(train_X[variables], train_y)\n",
|
|
" return model\n",
|
|
"\n",
|
|
"def score_model(model, variables):\n",
|
|
" return AIC_score(train_y, model.predict(train_X[variables]), model)\n",
|
|
"\n",
|
|
"best_model, best_variables = backward_elimination(train_X.columns, train_model, score_model, verbose=True)\n",
|
|
"\n",
|
|
"print(best_variables)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"Regression statistics\n",
|
|
"\n",
|
|
" Mean Error (ME) : 103.3045\n",
|
|
" Root Mean Squared Error (RMSE) : 1314.4844\n",
|
|
" Mean Absolute Error (MAE) : 1016.8875\n",
|
|
" Mean Percentage Error (MPE) : -0.2700\n",
|
|
"Mean Absolute Percentage Error (MAPE) : 8.9984\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"regressionSummary(valid_y, best_model.predict(valid_X[best_variables]))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table 6.7 Forward selection"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Variables: Age_08_04, KM, HP, Met_Color, Automatic, CC, Doors, Quarterly_Tax, Weight, Fuel_Type_Diesel, Fuel_Type_Petrol\n",
|
|
"Start: score=11565.07, constant\n",
|
|
"Step: score=10689.71, add Age_08_04\n",
|
|
"Step: score=10597.91, add HP\n",
|
|
"Step: score=10506.08, add Weight\n",
|
|
"Step: score=10445.17, add KM\n",
|
|
"Step: score=10435.58, add Quarterly_Tax\n",
|
|
"Step: score=10419.93, add Fuel_Type_Petrol\n",
|
|
"Step: score=10418.10, add Fuel_Type_Diesel\n",
|
|
"Step: score=10417.29, add Automatic\n",
|
|
"Step: score=10417.29, add None\n",
|
|
"['Age_08_04', 'HP', 'Weight', 'KM', 'Quarterly_Tax', 'Fuel_Type_Petrol', 'Fuel_Type_Diesel', 'Automatic']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# The initial model is the constant model - this requires special handling\n",
|
|
"# in train_model and score_model\n",
|
|
"def train_model(variables):\n",
|
|
" if len(variables) == 0:\n",
|
|
" return None\n",
|
|
" model = LinearRegression()\n",
|
|
" model.fit(train_X[variables], train_y)\n",
|
|
" return model\n",
|
|
"\n",
|
|
"def score_model(model, variables):\n",
|
|
" if len(variables) == 0:\n",
|
|
" return AIC_score(train_y, [train_y.mean()] * len(train_y), model, df=1)\n",
|
|
" return AIC_score(train_y, model.predict(train_X[variables]), model)\n",
|
|
"\n",
|
|
"best_model, best_variables = forward_selection(train_X.columns, train_model, score_model, verbose=True)\n",
|
|
"\n",
|
|
"print(best_variables)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Variables: Age_08_04, KM, HP, Met_Color, Automatic, CC, Doors, Quarterly_Tax, Weight, Fuel_Type_Diesel, Fuel_Type_Petrol\n",
|
|
"Start: score=11565.07, constant\n",
|
|
"Step: score=10689.71, add Age_08_04\n",
|
|
"Step: score=10597.91, add HP\n",
|
|
"Step: score=10506.08, add Weight\n",
|
|
"Step: score=10445.17, add KM\n",
|
|
"Step: score=10435.58, add Quarterly_Tax\n",
|
|
"Step: score=10419.93, add Fuel_Type_Petrol\n",
|
|
"Step: score=10418.10, add Fuel_Type_Diesel\n",
|
|
"Step: score=10417.29, add Automatic\n",
|
|
"Step: score=10417.29, unchanged None\n",
|
|
"['Age_08_04', 'HP', 'Weight', 'KM', 'Quarterly_Tax', 'Fuel_Type_Petrol', 'Fuel_Type_Diesel', 'Automatic']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"best_model, best_variables = stepwise_selection(train_X.columns, train_model, score_model, verbose=True)\n",
|
|
"\n",
|
|
"print(best_variables)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table XX regularized methods"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"Regression statistics\n",
|
|
"\n",
|
|
" Mean Error (ME) : 120.6311\n",
|
|
" Root Mean Squared Error (RMSE) : 1332.2752\n",
|
|
" Mean Absolute Error (MAE) : 1021.5286\n",
|
|
" Mean Percentage Error (MPE) : -0.2364\n",
|
|
"Mean Absolute Percentage Error (MAPE) : 9.0115\n",
|
|
"\n",
|
|
"Regression statistics\n",
|
|
"\n",
|
|
" Mean Error (ME) : 145.1571\n",
|
|
" Root Mean Squared Error (RMSE) : 1397.9428\n",
|
|
" Mean Absolute Error (MAE) : 1052.4649\n",
|
|
" Mean Percentage Error (MPE) : -0.2966\n",
|
|
"Mean Absolute Percentage Error (MAPE) : 9.2918\n",
|
|
"Lasso-CV chosen regularization: 3.5138446691310588\n",
|
|
"[-1.40370575e+02 -1.76669006e-02 3.38674037e+01 0.00000000e+00\n",
|
|
" 6.94393427e+01 0.00000000e+00 0.00000000e+00 2.70913468e+00\n",
|
|
" 1.24342596e+01 -0.00000000e+00 0.00000000e+00]\n",
|
|
"\n",
|
|
"Regression statistics\n",
|
|
"\n",
|
|
" Mean Error (ME) : 154.3286\n",
|
|
" Root Mean Squared Error (RMSE) : 1879.7426\n",
|
|
" Mean Absolute Error (MAE) : 1353.2735\n",
|
|
" Mean Percentage Error (MPE) : -2.3897\n",
|
|
"Mean Absolute Percentage Error (MAPE) : 11.1309\n",
|
|
"\n",
|
|
"Regression statistics\n",
|
|
"\n",
|
|
" Mean Error (ME) : 105.5382\n",
|
|
" Root Mean Squared Error (RMSE) : 1313.0217\n",
|
|
" Mean Absolute Error (MAE) : 1017.2356\n",
|
|
" Mean Percentage Error (MPE) : -0.2703\n",
|
|
"Mean Absolute Percentage Error (MAPE) : 9.0012\n",
|
|
"Bayesian ridge chosen regularization: 0.004622833440097622\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"lasso = Lasso(normalize=True, alpha=1)\n",
|
|
"lasso.fit(train_X, train_y)\n",
|
|
"regressionSummary(valid_y, lasso.predict(valid_X))\n",
|
|
"\n",
|
|
"lasso_cv = LassoCV(normalize=True, cv=5)\n",
|
|
"lasso_cv.fit(train_X, train_y)\n",
|
|
"regressionSummary(valid_y, lasso_cv.predict(valid_X))\n",
|
|
"print('Lasso-CV chosen regularization: ', lasso_cv.alpha_)\n",
|
|
"print(lasso_cv.coef_)\n",
|
|
"\n",
|
|
"ridge = Ridge(normalize=True, alpha=1)\n",
|
|
"ridge.fit(train_X, train_y)\n",
|
|
"regressionSummary(valid_y, ridge.predict(valid_X))\n",
|
|
"\n",
|
|
"bayesianRidge = BayesianRidge(normalize=True)\n",
|
|
"bayesianRidge.fit(train_X, train_y)\n",
|
|
"regressionSummary(valid_y, bayesianRidge.predict(valid_X))\n",
|
|
"print('Bayesian ridge chosen regularization: ', bayesianRidge.lambda_ / bayesianRidge.alpha_)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"Regression statistics\n",
|
|
"\n",
|
|
" Mean Error (ME) : 103.6803\n",
|
|
" Root Mean Squared Error (RMSE) : 1312.8523\n",
|
|
" Mean Absolute Error (MAE) : 1017.5972\n",
|
|
" Mean Percentage Error (MPE) : -0.2633\n",
|
|
"Mean Absolute Percentage Error (MAPE) : 9.0111\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"linearRegression = LinearRegression(normalize=True).fit(train_X, train_y)\n",
|
|
"regressionSummary(valid_y, linearRegression.predict(valid_X))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>features</th>\n",
|
|
" <th>linear regression</th>\n",
|
|
" <th>lassoCV</th>\n",
|
|
" <th>bayesianRidge</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>Age_08_04</td>\n",
|
|
" <td>-140.748761</td>\n",
|
|
" <td>-140.370575</td>\n",
|
|
" <td>-139.754059</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>KM</td>\n",
|
|
" <td>-0.017840</td>\n",
|
|
" <td>-0.017667</td>\n",
|
|
" <td>-0.018131</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>HP</td>\n",
|
|
" <td>36.103419</td>\n",
|
|
" <td>33.867404</td>\n",
|
|
" <td>35.856074</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>Met_Color</td>\n",
|
|
" <td>84.281830</td>\n",
|
|
" <td>0.000000</td>\n",
|
|
" <td>85.088966</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>Automatic</td>\n",
|
|
" <td>416.781954</td>\n",
|
|
" <td>69.439343</td>\n",
|
|
" <td>408.599781</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>5</th>\n",
|
|
" <td>CC</td>\n",
|
|
" <td>0.017737</td>\n",
|
|
" <td>0.000000</td>\n",
|
|
" <td>0.020405</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>6</th>\n",
|
|
" <td>Doors</td>\n",
|
|
" <td>-50.657863</td>\n",
|
|
" <td>0.000000</td>\n",
|
|
" <td>-47.917629</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>7</th>\n",
|
|
" <td>Quarterly_Tax</td>\n",
|
|
" <td>13.625325</td>\n",
|
|
" <td>2.709135</td>\n",
|
|
" <td>13.269979</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>8</th>\n",
|
|
" <td>Weight</td>\n",
|
|
" <td>13.038711</td>\n",
|
|
" <td>12.434260</td>\n",
|
|
" <td>13.114412</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>9</th>\n",
|
|
" <td>Fuel_Type_Diesel</td>\n",
|
|
" <td>1066.464681</td>\n",
|
|
" <td>-0.000000</td>\n",
|
|
" <td>955.581484</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>10</th>\n",
|
|
" <td>Fuel_Type_Petrol</td>\n",
|
|
" <td>2310.249543</td>\n",
|
|
" <td>0.000000</td>\n",
|
|
" <td>2162.115762</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" features linear regression lassoCV bayesianRidge\n",
|
|
"0 Age_08_04 -140.748761 -140.370575 -139.754059\n",
|
|
"1 KM -0.017840 -0.017667 -0.018131\n",
|
|
"2 HP 36.103419 33.867404 35.856074\n",
|
|
"3 Met_Color 84.281830 0.000000 85.088966\n",
|
|
"4 Automatic 416.781954 69.439343 408.599781\n",
|
|
"5 CC 0.017737 0.000000 0.020405\n",
|
|
"6 Doors -50.657863 0.000000 -47.917629\n",
|
|
"7 Quarterly_Tax 13.625325 2.709135 13.269979\n",
|
|
"8 Weight 13.038711 12.434260 13.114412\n",
|
|
"9 Fuel_Type_Diesel 1066.464681 -0.000000 955.581484\n",
|
|
"10 Fuel_Type_Petrol 2310.249543 0.000000 2162.115762"
|
|
]
|
|
},
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"pd.DataFrame({'features': train_X.columns, 'linear regression': linearRegression.coef_, \n",
|
|
" 'lassoCV': lasso_cv.coef_, 'bayesianRidge': bayesianRidge.coef_})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table 6.10"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {
|
|
"scrolled": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" OLS Regression Results \n",
|
|
"==============================================================================\n",
|
|
"Dep. Variable: Price R-squared: 0.856\n",
|
|
"Model: OLS Adj. R-squared: 0.854\n",
|
|
"Method: Least Squares F-statistic: 319.0\n",
|
|
"Date: Sun, 07 Apr 2019 Prob (F-statistic): 1.73e-239\n",
|
|
"Time: 13:53:06 Log-Likelihood: -5198.1\n",
|
|
"No. Observations: 600 AIC: 1.042e+04\n",
|
|
"Df Residuals: 588 BIC: 1.047e+04\n",
|
|
"Df Model: 11 \n",
|
|
"Covariance Type: nonrobust \n",
|
|
"====================================================================================\n",
|
|
" coef std err t P>|t| [0.025 0.975]\n",
|
|
"------------------------------------------------------------------------------------\n",
|
|
"Intercept -1319.3544 1728.427 -0.763 0.446 -4713.997 2075.288\n",
|
|
"Age_08_04 -140.7488 5.142 -27.374 0.000 -150.847 -130.650\n",
|
|
"KM -0.0178 0.002 -7.286 0.000 -0.023 -0.013\n",
|
|
"HP 36.1034 5.321 6.785 0.000 25.653 46.554\n",
|
|
"Met_Color 84.2818 127.005 0.664 0.507 -165.158 333.721\n",
|
|
"Automatic 416.7820 259.794 1.604 0.109 -93.454 927.018\n",
|
|
"CC 0.0177 0.099 0.179 0.858 -0.177 0.213\n",
|
|
"Doors -50.6579 65.187 -0.777 0.437 -178.686 77.371\n",
|
|
"Quarterly_Tax 13.6253 2.518 5.411 0.000 8.680 18.571\n",
|
|
"Weight 13.0387 1.602 8.140 0.000 9.893 16.185\n",
|
|
"Fuel_Type_Diesel 1066.4647 527.285 2.023 0.044 30.872 2102.057\n",
|
|
"Fuel_Type_Petrol 2310.2495 521.045 4.434 0.000 1286.914 3333.585\n",
|
|
"==============================================================================\n",
|
|
"Omnibus: 62.422 Durbin-Watson: 1.899\n",
|
|
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 366.046\n",
|
|
"Skew: 0.186 Prob(JB): 3.27e-80\n",
|
|
"Kurtosis: 6.808 Cond. No. 2.20e+06\n",
|
|
"==============================================================================\n",
|
|
"\n",
|
|
"Warnings:\n",
|
|
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
|
|
"[2] The condition number is large, 2.2e+06. This might indicate that there are\n",
|
|
"strong multicollinearity or other numerical problems.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# run a linear regression of Price on the remaining 11 predictors in the training set\n",
|
|
"train_df = train_X.join(train_y)\n",
|
|
"\n",
|
|
"predictors = train_X.columns\n",
|
|
"formula = 'Price ~ ' + ' + '.join(predictors)\n",
|
|
"\n",
|
|
"car_lm = sm.ols(formula=formula, data=train_df).fit()\n",
|
|
"print(car_lm.summary())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|