From 58b92cfb7a242f795e74d8b421d63e07fbee8b37 Mon Sep 17 00:00:00 2001 From: noah Date: Tue, 28 Feb 2023 12:48:35 -0600 Subject: [PATCH] Conceptual responses and residuals --- .~lock.Tayko.csv# | 1 - Schrick-Noah_Learning-Practice-5.ipynb | 174 ++++++++++++++++++++++--- 2 files changed, 159 insertions(+), 16 deletions(-) delete mode 100644 .~lock.Tayko.csv# diff --git a/.~lock.Tayko.csv# b/.~lock.Tayko.csv# deleted file mode 100644 index 026fb0e..0000000 --- a/.~lock.Tayko.csv# +++ /dev/null @@ -1 +0,0 @@ -,noah,NovaArchSys,27.02.2023 17:02,file:///home/noah/.config/libreoffice/4; \ No newline at end of file diff --git a/Schrick-Noah_Learning-Practice-5.ipynb b/Schrick-Noah_Learning-Practice-5.ipynb index a124100..4e0ff37 100644 --- a/Schrick-Noah_Learning-Practice-5.ipynb +++ b/Schrick-Noah_Learning-Practice-5.ipynb @@ -644,7 +644,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 114, "metadata": {}, "outputs": [], "source": [ @@ -659,7 +659,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 115, "metadata": {}, "outputs": [ { @@ -673,19 +673,13 @@ "2 last_update_days_ago -0.010374\n", "3 Web order 18.628731\n", "4 Gender=male -9.111366\n", - "5 Address_is_res -75.815354\n" - ] - }, - { - "ename": "ValueError", - "evalue": "operands could not be broadcast together with shapes (800,) (1200,) ", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[111], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[39mprint\u001b[39m(pd\u001b[39m.\u001b[39mDataFrame({\u001b[39m'\u001b[39m\u001b[39mPredictor\u001b[39m\u001b[39m'\u001b[39m: X\u001b[39m.\u001b[39mcolumns, \u001b[39m'\u001b[39m\u001b[39mcoefficient\u001b[39m\u001b[39m'\u001b[39m: tayko_lm\u001b[39m.\u001b[39mcoef_}))\n\u001b[1;32m 9\u001b[0m \u001b[39m# print performance 
measures\u001b[39;00m\n\u001b[0;32m---> 10\u001b[0m regressionSummary(valid_y, tayko_lm\u001b[39m.\u001b[39;49mpredict(train_X))\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/dmba/metric.py:71\u001b[0m, in \u001b[0;36mregressionSummary\u001b[0;34m(y_true, y_pred)\u001b[0m\n\u001b[1;32m 69\u001b[0m y_true \u001b[39m=\u001b[39m _toArray(y_true)\n\u001b[1;32m 70\u001b[0m y_pred \u001b[39m=\u001b[39m _toArray(y_pred)\n\u001b[0;32m---> 71\u001b[0m y_res \u001b[39m=\u001b[39m y_true \u001b[39m-\u001b[39;49m y_pred\n\u001b[1;32m 72\u001b[0m metrics \u001b[39m=\u001b[39m [\n\u001b[1;32m 73\u001b[0m (\u001b[39m'\u001b[39m\u001b[39mMean Error (ME)\u001b[39m\u001b[39m'\u001b[39m, \u001b[39msum\u001b[39m(y_res) \u001b[39m/\u001b[39m \u001b[39mlen\u001b[39m(y_res)),\n\u001b[1;32m 74\u001b[0m (\u001b[39m'\u001b[39m\u001b[39mRoot Mean Squared Error (RMSE)\u001b[39m\u001b[39m'\u001b[39m, math\u001b[39m.\u001b[39msqrt(mean_squared_error(y_true, y_pred))),\n\u001b[1;32m 75\u001b[0m (\u001b[39m'\u001b[39m\u001b[39mMean Absolute Error (MAE)\u001b[39m\u001b[39m'\u001b[39m, \u001b[39msum\u001b[39m(\u001b[39mabs\u001b[39m(y_res)) \u001b[39m/\u001b[39m \u001b[39mlen\u001b[39m(y_res)),\n\u001b[1;32m 76\u001b[0m ]\n\u001b[1;32m 77\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mall\u001b[39m(yt \u001b[39m!=\u001b[39m \u001b[39m0\u001b[39m \u001b[39mfor\u001b[39;00m yt \u001b[39min\u001b[39;00m y_true):\n", - "\u001b[0;31mValueError\u001b[0m: operands could not be broadcast together with shapes (800,) (1200,) " + "5 Address_is_res -75.815354\n", + "\n", + "Regression statistics\n", + "\n", + " Mean Error (ME) : 7.1933\n", + "Root Mean Squared Error (RMSE) : 136.7397\n", + " Mean Absolute Error (MAE) : 83.6010\n" ] } ], @@ -701,6 +695,156 @@ "# print performance measures\n", "regressionSummary(valid_y, tayko_lm.predict(valid_X))" ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# iii Based on this model, what type of purchaser is most 
likely to spend a large amount of money?\n", + "Women outside the US who do not have a residential address, who place web orders, who made many transactions in the previous year, and whose record was updated recently (the `last_update_days_ago` coefficient is slightly negative)." + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Backward\n", + "Variables: US, Freq, last_update_days_ago, Web order, Gender=male, Address_is_res\n", + "Start: score=15028.53\n", + "Step: score=15026.76, remove US\n", + "Step: score=15026.38, remove Gender=male\n", + "Step: score=15026.38, remove None\n", + "['Freq', 'last_update_days_ago', 'Web order', 'Address_is_res']\n", + "\n" + ] + } + ], + "source": [ + "#iv. If we used backward elimination to reduce the number\n", + "# of predictors, which predictor would be dropped first \n", + "# from the model?\n", + "\n", + "def train_model(variables):\n", + " if len(variables) == 0:\n", + " return None\n", + " model = LinearRegression()\n", + " model.fit(train_X[variables], train_y)\n", + " return model\n", + "\n", + "def score_model(model, variables):\n", + " if len(variables) == 0:\n", + " return AIC_score(train_y, [train_y.mean()] * len(train_y), model, df=1)\n", + " return AIC_score(train_y, model.predict(train_X[variables]), model)\n", + "\n", + "print(\"Backward\")\n", + "best_back_model, best_back_variables = backward_elimination(train_X.columns, train_model, score_model, verbose=True)\n", + "print(best_back_variables)\n", + "\n", + "# 'US' dropped first" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# v. 
Show how the prediction and the prediction error are computed for the first purchase in the validation set.\n", + "\n", + "After the model is trained, we have the regression coefficients and the intercept.\n", + "To predict a new record, each predictor value is multiplied by its coefficient, and the products are summed and added to the intercept.\n", + "For the first purchase in the validation set (index 674 in the output of part vi), this gives a predicted spending of 89.214915.\n", + "\n", + "The prediction error (residual) is the actual value minus the predicted value: the actual spending for that purchase is 0, so the error is 0 - 89.214915 = -89.214915." + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Predicted Actual Residual\n", + "674 89.214915 0 -89.214915\n", + "1699 202.231362 184 -18.231362\n", + "1282 49.159303 0 -49.159303\n", + "1315 824.841659 1289 464.158341\n", + "1210 0.121196 0 -0.121196\n", + "1636 86.766675 0 -86.766675\n", + "613 58.018614 0 -58.018614\n", + "447 247.428569 1255 1007.571431\n", + "1131 67.036615 0 -67.036615\n", + "808 67.825031 0 -67.825031\n", + "1496 -7.098168 0 7.098168\n", + "1468 194.814024 411 216.185976\n", + "1682 -13.480101 0 13.480101\n", + "1149 -32.457046 0 32.457046\n", + "442 61.247979 0 -61.247979\n", + "1813 4.497885 173 168.502115\n", + "654 -46.046854 0 46.046854\n", + "1264 -32.315195 0 32.315195\n", + "858 80.219048 0 -80.219048\n", + "1482 51.783900 0 -51.783900\n", + "\n", + "Regression statistics\n", + "\n", + " Mean Error (ME) : 7.1933\n", + "Root Mean Squared Error (RMSE) : 136.7397\n", + " Mean Absolute Error (MAE) : 83.6010\n" + ] + } + ], + "source": [ + "#vi. 
Evaluate the predictive accuracy of the model by\n", + "# examining its performance on the validation set.\n", + "\n", + "tayko_lm_pred = tayko_lm.predict(valid_X)\n", + "\n", + "result = pd.DataFrame({'Predicted': tayko_lm_pred, 'Actual': valid_y,\n", + " 'Residual': valid_y - tayko_lm_pred})\n", + "print(result.head(20))\n", + "\n", + "# Compute common accuracy measures\n", + "regressionSummary(valid_y, tayko_lm_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#vii. Create a histogram of the model residuals. \n", + "# Do they appear to follow a normal distribution? \n", + "# How does this affect the predictive performance of the model?\n", + "\n", + "tayko_lm_pred = tayko_lm.predict(valid_X)\n", + "all_residuals = valid_y - tayko_lm_pred\n", + "\n", + "ax = pd.DataFrame({'Residuals': all_residuals}).hist(bins=25)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] } ], "metadata": {